TextEncodingDetectorICU.cpp source code [webkit/Source/WebCore/platform/text/TextEncodingDetectorICU.cpp]

1	/*
2	* Copyright (C) 2008, 2009 Google Inc. All rights reserved.
3	*
4	* Redistribution and use in source and binary forms, with or without
5	* modification, are permitted provided that the following conditions are
6	* met:
7	*
8	* * Redistributions of source code must retain the above copyright
9	* notice, this list of conditions and the following disclaimer.
10	* * Redistributions in binary form must reproduce the above
11	* copyright notice, this list of conditions and the following disclaimer
12	* in the documentation and/or other materials provided with the
13	* distribution.
14	* * Neither the name of Google Inc. nor the names of its
15	* contributors may be used to endorse or promote products derived from
16	* this software without specific prior written permission.
17	*
18	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21	* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22	* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29	*/
30
31	#include "config.h"
32	#include "TextEncodingDetector.h"
33
34	#include "TextEncoding.h"
35	#include <unicode/ucnv.h>
36	#include <unicode/ucsdet.h>
37
38	namespace WebCore {
39
40	bool detectTextEncoding(const char* data, size_t len,
41	const char* hintEncodingName,
42	TextEncoding* detectedEncoding)
43	{
44	*detectedEncoding = TextEncoding ();
45	int matchesCount = `0`;
46	UErrorCode status = U_ZERO_ERROR;
47	UCharsetDetector* detector = ucsdet_open(&status);
48	if (U_FAILURE(status))
49	return false;
50	ucsdet_enableInputFilter(detector, true);
51	ucsdet_setText(detector, data, static_cast<int32_t>(len), &status);
52	if (U_FAILURE(status))
53	return false;
54
55	// FIXME: A few things we can do other than improving
56	// the ICU detector itself.
57	// 1. Use ucsdet_detectAll and pick the most likely one given
58	// "the context" (parent-encoding, referrer encoding, etc).
59	// 2. 'Emulate' Firefox/IE's non-Universal detectors (e.g.
60	// Chinese, Japanese, Russian, Korean and Hebrew) by picking the
61	// encoding with a highest confidence among the detector-specific
62	// limited set of candidate encodings.
63	// Below is a partial implementation of the first part of what's outlined
64	// above.
65	const UCharsetMatch** matches = ucsdet_detectAll(detector, &matchesCount, &status);
66	if (U_FAILURE(status)) {
67	ucsdet_close(detector);
68	return false;
69	}
70
71	const char* encoding = `0`;
72	if (hintEncodingName) {
73	TextEncoding hintEncoding(hintEncodingName);
74	// 10 is the minimum confidence value consistent with the codepoint
75	// allocation in a given encoding. The size of a chunk passed to
76	// us varies even for the same html file (apparently depending on
77	// the network load). When we're given a rather short chunk, we
78	// don't have a sufficiently reliable signal other than the fact that
79	// the chunk is consistent with a set of encodings. So, instead of
80	// setting an arbitrary threshold, we have to scan all the encodings
81	// consistent with the data.
82	const int32_t kThreshold = `10`;
83	for (int i = `0`; i < matchesCount; ++i) {
84	int32_t confidence = ucsdet_getConfidence(matches[i], &status);
85	if (U_FAILURE(status)) {
86	status = U_ZERO_ERROR;
87	continue;
88	}
89	if (confidence < kThreshold)
90	break;
91	const char* matchEncoding = ucsdet_getName(matches[i], &status);
92	if (U_FAILURE(status)) {
93	status = U_ZERO_ERROR;
94	continue;
95	}
96	if (TextEncoding (matchEncoding) == hintEncoding) {
97	encoding = hintEncodingName;
98	break;
99	}
100	}
101	}
102	// If no match is found so far, just pick the top match.
103	// This can happen, say, when a parent frame in EUC-JP refers to
104	// a child frame in Shift_JIS and both frames do NOT specify the encoding
105	// making us resort to auto-detection (when it IS turned on).
106	if (!encoding && matchesCount > `0`)
107	encoding = ucsdet_getName(matches[`0`], &status);
108	if (U_SUCCESS(status)) {
109	*detectedEncoding = TextEncoding (encoding);
110	ucsdet_close(detector);
111	return true;
112	}
113	ucsdet_close(detector);
114	return false;
115	}
116
117	}
118

Browse the source code of webkit/Source/WebCore/platform/text/TextEncodingDetectorICU.cpp