Line data Source code
1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /*
3 : * This file is part of the LibreOffice project.
4 : *
5 : * This Source Code Form is subject to the terms of the Mozilla Public
6 : * License, v. 2.0. If a copy of the MPL was not distributed with this
7 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 : *
9 : * This file incorporates work covered by the following license notice:
10 : *
11 : * Licensed to the Apache Software Foundation (ASF) under one or more
12 : * contributor license agreements. See the NOTICE file distributed
13 : * with this work for additional information regarding copyright
14 : * ownership. The ASF licenses this file to you under the Apache
15 : * License, Version 2.0 (the "License"); you may not use this file
16 : * except in compliance with the License. You may obtain a copy of
17 : * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 : */
19 :
20 : #include "sal/config.h"
21 :
22 : #include "sal/types.h"
23 : #include "rtl/textcvt.h"
24 :
25 : #include "converter.hxx"
26 : #include "tcvtutf8.hxx"
27 : #include "tenchelp.hxx"
28 : #include "unichars.hxx"
29 :
30 : struct ImplUtf8ToUnicodeContext
31 : {
32 : sal_uInt32 nUtf32;
33 : int nShift;
34 : bool bCheckBom;
35 : };
36 :
37 : struct ImplUnicodeToUtf8Context
38 : {
39 : sal_Unicode nHighSurrogate; /* 0xFFFF: write BOM */
40 : };
41 :
42 96 : void * ImplCreateUtf8ToUnicodeContext()
43 : {
44 96 : ImplUtf8ToUnicodeContext * p = new ImplUtf8ToUnicodeContext;
45 96 : ImplResetUtf8ToUnicodeContext(p);
46 96 : return p;
47 : }
48 :
49 108 : void ImplResetUtf8ToUnicodeContext(void * pContext)
50 : {
51 108 : if (pContext != NULL)
52 : {
53 108 : static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = -1;
54 108 : static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = true;
55 : }
56 108 : }
57 :
58 96 : void ImplDestroyUtf8ToUnicodeContext(void * pContext)
59 : {
60 96 : delete static_cast< ImplUtf8ToUnicodeContext * >(pContext);
61 96 : }
62 :
63 1658577 : sal_Size ImplConvertUtf8ToUnicode(
64 : void const * pData, void * pContext, char const * pSrcBuf,
65 : sal_Size nSrcBytes, sal_Unicode * pDestBuf, sal_Size nDestChars,
66 : sal_uInt32 nFlags, sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes)
67 : {
68 : /*
69 : This function is very liberal with the UTF-8 input. Accepted are:
70 : - non-shortest forms (e.g., C0 41 instead of 41 to represent U+0041)
71 : - surrogates (e.g., ED A0 80 to represent U+D800)
72 : - encodings with up to six bytes (everything outside the range
73 : U+0000..10FFFF is considered "undefined")
74 : The first two of these points allow this routine to translate from both
75 : RTL_TEXTENCODING_UTF8 and RTL_TEXTENCODING_JAVA_UTF8.
76 : */
77 :
78 1658577 : bool bJavaUtf8 = pData != NULL;
79 1658577 : sal_uInt32 nUtf32 = 0;
80 1658577 : int nShift = -1;
81 1658577 : bool bCheckBom = true;
82 1658577 : sal_uInt32 nInfo = 0;
83 1658577 : unsigned char const * pSrcBufPtr = (unsigned char const *) pSrcBuf;
84 1658577 : unsigned char const * pSrcBufEnd = pSrcBufPtr + nSrcBytes;
85 1658577 : sal_Unicode * pDestBufPtr = pDestBuf;
86 1658577 : sal_Unicode * pDestBufEnd = pDestBufPtr + nDestChars;
87 :
88 1658577 : if (pContext != NULL)
89 : {
90 185378 : nUtf32 = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32;
91 185378 : nShift = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift;
92 185378 : bCheckBom = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom;
93 : }
94 :
95 39711428 : while (pSrcBufPtr < pSrcBufEnd)
96 : {
97 36394694 : bool bUndefined = false;
98 36394694 : bool bConsume = true;
99 36394694 : sal_uInt32 nChar = *pSrcBufPtr++;
100 36394694 : if (nShift < 0)
101 30384495 : if (nChar <= 0x7F)
102 : {
103 27052560 : nUtf32 = nChar;
104 27052560 : goto transform;
105 : }
106 3331935 : else if (nChar <= 0xBF)
107 23104 : goto bad_input;
108 3308831 : else if (nChar <= 0xDF)
109 : {
110 69491 : nUtf32 = (nChar & 0x1F) << 6;
111 69491 : nShift = 0;
112 : }
113 3239340 : else if (nChar <= 0xEF)
114 : {
115 2964824 : nUtf32 = (nChar & 0x0F) << 12;
116 2964824 : nShift = 6;
117 : }
118 274516 : else if (nChar <= 0xF7)
119 : {
120 16896 : nUtf32 = (nChar & 0x07) << 18;
121 16896 : nShift = 12;
122 : }
123 257620 : else if (nChar <= 0xFB)
124 : {
125 30 : nUtf32 = (nChar & 0x03) << 24;
126 30 : nShift = 18;
127 : }
128 257590 : else if (nChar <= 0xFD)
129 : {
130 1286 : nUtf32 = (nChar & 0x01) << 30;
131 1286 : nShift = 24;
132 : }
133 : else
134 256304 : goto bad_input;
135 6010199 : else if ((nChar & 0xC0) == 0x80)
136 : {
137 5968403 : nUtf32 |= (nChar & 0x3F) << nShift;
138 5968403 : if (nShift == 0)
139 3010723 : goto transform;
140 : else
141 2957680 : nShift -= 6;
142 : }
143 : else
144 : {
145 : /*
146 : This byte is preceded by a broken UTF-8 sequence; if this byte
147 : is neither in the range [0x80..0xBF] nor in the range
148 : [0xFE..0xFF], assume that this byte does not belong to that
149 : broken sequence, but instead starts a new, legal UTF-8 sequence:
150 : */
151 41796 : bConsume = nChar >= 0xFE;
152 41796 : goto bad_input;
153 : }
154 6010207 : continue;
155 :
156 : transform:
157 30063283 : if (!bCheckBom || nUtf32 != 0xFEFF
158 20 : || (nFlags & RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE) == 0
159 12 : || bJavaUtf8)
160 : {
161 30063275 : if (nUtf32 <= 0xFFFF)
162 30063249 : if (pDestBufPtr != pDestBufEnd)
163 30063015 : *pDestBufPtr++ = (sal_Unicode) nUtf32;
164 : else
165 234 : goto no_output;
166 26 : else if (nUtf32 <= 0x10FFFF)
167 10 : if (pDestBufEnd - pDestBufPtr >= 2)
168 : {
169 10 : *pDestBufPtr++ = (sal_Unicode) ImplGetHighSurrogate(nUtf32);
170 10 : *pDestBufPtr++ = (sal_Unicode) ImplGetLowSurrogate(nUtf32);
171 : }
172 : else
173 0 : goto no_output;
174 : else
175 : {
176 16 : bUndefined = true;
177 16 : goto bad_input;
178 : }
179 : }
180 30063033 : nShift = -1;
181 30063033 : bCheckBom = false;
182 30063033 : continue;
183 :
184 : bad_input:
185 321220 : switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
186 : bUndefined, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
187 321220 : &nInfo))
188 : {
189 : case sal::detail::textenc::BAD_INPUT_STOP:
190 62 : nShift = -1;
191 62 : bCheckBom = false;
192 62 : if (!bConsume)
193 4 : --pSrcBufPtr;
194 62 : break;
195 :
196 : case sal::detail::textenc::BAD_INPUT_CONTINUE:
197 321034 : nShift = -1;
198 321034 : bCheckBom = false;
199 321034 : if (!bConsume)
200 34876 : --pSrcBufPtr;
201 321034 : continue;
202 :
203 : case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
204 124 : goto no_output;
205 : }
206 62 : break;
207 :
208 : no_output:
209 358 : --pSrcBufPtr;
210 358 : nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
211 358 : break;
212 : }
213 :
214 1658577 : if (nShift >= 0
215 212 : && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
216 : | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL))
217 : == 0)
218 : {
219 100 : if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
220 98 : nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL;
221 : else
222 2 : switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
223 : false, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
224 2 : &nInfo))
225 : {
226 : case sal::detail::textenc::BAD_INPUT_STOP:
227 : case sal::detail::textenc::BAD_INPUT_CONTINUE:
228 2 : nShift = -1;
229 2 : bCheckBom = false;
230 2 : break;
231 :
232 : case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
233 0 : nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
234 0 : break;
235 : }
236 : }
237 :
238 1658577 : if (pContext != NULL)
239 : {
240 185378 : static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32 = nUtf32;
241 185378 : static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = nShift;
242 185378 : static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = bCheckBom;
243 : }
244 1658577 : if (pInfo != NULL)
245 1658577 : *pInfo = nInfo;
246 1658577 : if (pSrcCvtBytes != NULL)
247 1658577 : *pSrcCvtBytes = reinterpret_cast< char const * >(pSrcBufPtr) - pSrcBuf;
248 1658577 : return pDestBufPtr - pDestBuf;
249 : }
250 :
251 4568 : void * ImplCreateUnicodeToUtf8Context()
252 : {
253 4568 : ImplUnicodeToUtf8Context * p = new ImplUnicodeToUtf8Context;
254 4568 : ImplResetUnicodeToUtf8Context(p);
255 4568 : return p;
256 : }
257 :
258 4568 : void ImplResetUnicodeToUtf8Context(void * pContext)
259 : {
260 4568 : if (pContext != NULL)
261 4568 : static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate = 0xFFFF;
262 4568 : }
263 :
264 4568 : void ImplDestroyUnicodeToUtf8Context(void * pContext)
265 : {
266 4568 : delete static_cast< ImplUnicodeToUtf8Context * >(pContext);
267 4568 : }
268 :
269 1284215 : sal_Size ImplConvertUnicodeToUtf8(
270 : void const * pData, void * pContext, sal_Unicode const * pSrcBuf,
271 : sal_Size nSrcChars, char * pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
272 : sal_uInt32 * pInfo, sal_Size * pSrcCvtChars)
273 : {
274 1284215 : bool bJavaUtf8 = pData != NULL;
275 1284215 : sal_Unicode nHighSurrogate = 0xFFFF;
276 1284215 : sal_uInt32 nInfo = 0;
277 1284215 : sal_Unicode const * pSrcBufPtr = pSrcBuf;
278 1284215 : sal_Unicode const * pSrcBufEnd = pSrcBufPtr + nSrcChars;
279 1284215 : char * pDestBufPtr = pDestBuf;
280 1284215 : char * pDestBufEnd = pDestBufPtr + nDestBytes;
281 :
282 1284215 : if (pContext != NULL)
283 : nHighSurrogate
284 38122 : = static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate;
285 :
286 1284215 : if (nHighSurrogate == 0xFFFF)
287 : {
288 1250661 : if ((nFlags & RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE) != 0
289 6 : && !bJavaUtf8)
290 : {
291 4 : if (pDestBufEnd - pDestBufPtr >= 3)
292 : {
293 : /* Write BOM (U+FEFF) as UTF-8: */
294 4 : *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xEF));
295 4 : *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBB));
296 4 : *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBF));
297 : }
298 : else
299 : {
300 0 : nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
301 0 : goto done;
302 : }
303 : }
304 1250661 : nHighSurrogate = 0;
305 : }
306 :
307 48386242 : while (pSrcBufPtr < pSrcBufEnd)
308 : {
309 45817812 : sal_uInt32 nChar = *pSrcBufPtr++;
310 45817812 : if (nHighSurrogate == 0)
311 : {
312 45817810 : if (ImplIsHighSurrogate(nChar) && !bJavaUtf8)
313 : {
314 2 : nHighSurrogate = (sal_Unicode) nChar;
315 2 : continue;
316 : }
317 : }
318 2 : else if (ImplIsLowSurrogate(nChar) && !bJavaUtf8)
319 2 : nChar = ImplCombineSurrogates(nHighSurrogate, nChar);
320 : else
321 0 : goto bad_input;
322 :
323 91635622 : if ((ImplIsLowSurrogate(nChar) && !bJavaUtf8)
324 91635620 : || ImplIsNoncharacter(nChar))
325 4 : goto bad_input;
326 :
327 45817806 : if (nChar <= 0x7F && (!bJavaUtf8 || nChar != 0))
328 91438808 : if (pDestBufPtr != pDestBufEnd)
329 45719404 : *pDestBufPtr++ = static_cast< char >(nChar);
330 : else
331 0 : goto no_output;
332 98402 : else if (nChar <= 0x7FF)
333 9483 : if (pDestBufEnd - pDestBufPtr >= 2)
334 : {
335 9483 : *pDestBufPtr++ = static_cast< char >(0xC0 | (nChar >> 6));
336 9483 : *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
337 : }
338 : else
339 0 : goto no_output;
340 88919 : else if (nChar <= 0xFFFF)
341 88917 : if (pDestBufEnd - pDestBufPtr >= 3)
342 : {
343 88917 : *pDestBufPtr++ = static_cast< char >(0xE0 | (nChar >> 12));
344 88917 : *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
345 88917 : *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
346 : }
347 : else
348 0 : goto no_output;
349 2 : else if (pDestBufEnd - pDestBufPtr >= 4)
350 : {
351 2 : *pDestBufPtr++ = static_cast< char >(0xF0 | (nChar >> 18));
352 2 : *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 12) & 0x3F));
353 2 : *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
354 2 : *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
355 : }
356 : else
357 0 : goto no_output;
358 45817806 : nHighSurrogate = 0;
359 45817806 : continue;
360 :
361 : bad_input:
362 4 : switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
363 : false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo, NULL,
364 4 : 0, NULL))
365 : {
366 : case sal::detail::textenc::BAD_INPUT_STOP:
367 0 : nHighSurrogate = 0;
368 0 : break;
369 :
370 : case sal::detail::textenc::BAD_INPUT_CONTINUE:
371 4 : nHighSurrogate = 0;
372 4 : continue;
373 :
374 : case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
375 0 : goto no_output;
376 : }
377 0 : break;
378 :
379 : no_output:
380 0 : --pSrcBufPtr;
381 0 : nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
382 0 : break;
383 : }
384 :
385 1284215 : if (nHighSurrogate != 0
386 0 : && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
387 : | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
388 : == 0)
389 : {
390 0 : if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
391 0 : nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
392 : else
393 0 : switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
394 : false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo,
395 0 : NULL, 0, NULL))
396 : {
397 : case sal::detail::textenc::BAD_INPUT_STOP:
398 : case sal::detail::textenc::BAD_INPUT_CONTINUE:
399 0 : nHighSurrogate = 0;
400 0 : break;
401 :
402 : case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
403 0 : nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
404 0 : break;
405 : }
406 : }
407 :
408 : done:
409 1284215 : if (pContext != NULL)
410 : static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate
411 38122 : = nHighSurrogate;
412 1284215 : if (pInfo != NULL)
413 1284215 : *pInfo = nInfo;
414 1284215 : if (pSrcCvtChars != NULL)
415 1284215 : *pSrcCvtChars = pSrcBufPtr - pSrcBuf;
416 1284215 : return pDestBufPtr - pDestBuf;
417 : }
418 :
419 : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|