Line data Source code
1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /*
3 : * This file is part of the LibreOffice project.
4 : *
5 : * This Source Code Form is subject to the terms of the Mozilla Public
6 : * License, v. 2.0. If a copy of the MPL was not distributed with this
7 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 : *
9 : * This file incorporates work covered by the following license notice:
10 : *
11 : * Licensed to the Apache Software Foundation (ASF) under one or more
12 : * contributor license agreements. See the NOTICE file distributed
13 : * with this work for additional information regarding copyright
14 : * ownership. The ASF licenses this file to you under the Apache
15 : * License, Version 2.0 (the "License"); you may not use this file
16 : * except in compliance with the License. You may obtain a copy of
17 : * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 : */
19 :
20 : #include "sal/config.h"
21 :
22 : #include "sal/types.h"
23 : #include "rtl/textcvt.h"
24 :
25 : #include "converter.hxx"
26 : #include "tcvtutf8.hxx"
27 : #include "tenchelp.hxx"
28 : #include "unichars.hxx"
29 :
30 : struct ImplUtf8ToUnicodeContext
31 : {
32 : sal_uInt32 nUtf32;
33 : int nShift;
34 : bool bCheckBom;
35 : };
36 :
37 : struct ImplUnicodeToUtf8Context
38 : {
39 : sal_Unicode nHighSurrogate; /* 0xFFFF: write BOM */
40 : };
41 :
42 157 : void * ImplCreateUtf8ToUnicodeContext()
43 : {
44 157 : ImplUtf8ToUnicodeContext * p = new ImplUtf8ToUnicodeContext;
45 157 : ImplResetUtf8ToUnicodeContext(p);
46 157 : return p;
47 : }
48 :
49 157 : void ImplResetUtf8ToUnicodeContext(void * pContext)
50 : {
51 157 : if (pContext != NULL)
52 : {
53 157 : static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = -1;
54 157 : static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = true;
55 : }
56 157 : }
57 :
58 157 : void ImplDestroyUtf8ToUnicodeContext(void * pContext)
59 : {
60 157 : delete static_cast< ImplUtf8ToUnicodeContext * >(pContext);
61 157 : }
62 :
63 1216645 : sal_Size ImplConvertUtf8ToUnicode(
64 : void const * pData, void * pContext, char const * pSrcBuf,
65 : sal_Size nSrcBytes, sal_Unicode * pDestBuf, sal_Size nDestChars,
66 : sal_uInt32 nFlags, sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes)
67 : {
68 : /*
69 : This function is very liberal with the UTF-8 input. Accepted are:
70 : - non-shortest forms (e.g., C0 41 instead of 41 to represent U+0041)
71 : - surrogates (e.g., ED A0 80 to represent U+D800)
72 : - encodings with up to six bytes (everything outside the range
73 : U+0000..10FFFF is considered "undefined")
74 : The first two of these points allow this routine to translate from both
75 : RTL_TEXTENCODING_UTF8 and RTL_TEXTENCODING_JAVA_UTF8.
76 : */
77 :
78 1216645 : int bJavaUtf8 = pData != NULL;
79 1216645 : sal_uInt32 nUtf32 = 0;
80 1216645 : int nShift = -1;
81 1216645 : bool bCheckBom = true;
82 1216645 : sal_uInt32 nInfo = 0;
83 1216645 : sal_uChar const * pSrcBufPtr = (sal_uChar const *) pSrcBuf;
84 1216645 : sal_uChar const * pSrcBufEnd = pSrcBufPtr + nSrcBytes;
85 1216645 : sal_Unicode * pDestBufPtr = pDestBuf;
86 1216645 : sal_Unicode * pDestBufEnd = pDestBufPtr + nDestChars;
87 :
88 1216645 : if (pContext != NULL)
89 : {
90 629 : nUtf32 = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32;
91 629 : nShift = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift;
92 629 : bCheckBom = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom;
93 : }
94 :
95 31431436 : while (pSrcBufPtr < pSrcBufEnd)
96 : {
97 28998202 : bool bUndefined = false;
98 28998202 : int bConsume = true;
99 28998202 : sal_uInt32 nChar = *pSrcBufPtr++;
100 28998202 : if (nShift < 0)
101 24451269 : if (nChar <= 0x7F)
102 : {
103 22167472 : nUtf32 = nChar;
104 22167472 : goto transform;
105 : }
106 2283797 : else if (nChar <= 0xBF)
107 95 : goto bad_input;
108 2283702 : else if (nChar <= 0xDF)
109 : {
110 20497 : nUtf32 = (nChar & 0x1F) << 6;
111 20497 : nShift = 0;
112 : }
113 2263205 : else if (nChar <= 0xEF)
114 : {
115 2263144 : nUtf32 = (nChar & 0x0F) << 12;
116 2263144 : nShift = 6;
117 : }
118 61 : else if (nChar <= 0xF7)
119 : {
120 27 : nUtf32 = (nChar & 0x07) << 18;
121 27 : nShift = 12;
122 : }
123 34 : else if (nChar <= 0xFB)
124 : {
125 15 : nUtf32 = (nChar & 0x03) << 24;
126 15 : nShift = 18;
127 : }
128 19 : else if (nChar <= 0xFD)
129 : {
130 13 : nUtf32 = (nChar & 0x01) << 30;
131 13 : nShift = 24;
132 : }
133 : else
134 6 : goto bad_input;
135 4546933 : else if ((nChar & 0xC0) == 0x80)
136 : {
137 4545724 : nUtf32 |= (nChar & 0x3F) << nShift;
138 4545724 : if (nShift == 0)
139 2282484 : goto transform;
140 : else
141 2263240 : nShift -= 6;
142 : }
143 : else
144 : {
145 : /*
146 : This byte is preceded by a broken UTF-8 sequence; if this byte
147 : is neither in the range [0x80..0xBF] nor in the range
148 : [0xFE..0xFF], assume that this byte does not belong to that
149 : broken sequence, but instead starts a new, legal UTF-8 sequence:
150 : */
151 1209 : bConsume = nChar >= 0xFE;
152 1209 : goto bad_input;
153 : }
154 4546936 : continue;
155 :
156 : transform:
157 24449956 : if (!bCheckBom || nUtf32 != 0xFEFF
158 10 : || (nFlags & RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE) == 0
159 6 : || bJavaUtf8)
160 : {
161 24449952 : if (nUtf32 <= 0xFFFF)
162 24449939 : if (pDestBufPtr != pDestBufEnd)
163 24449939 : *pDestBufPtr++ = (sal_Unicode) nUtf32;
164 : else
165 0 : goto no_output;
166 13 : else if (nUtf32 <= 0x10FFFF)
167 5 : if (pDestBufEnd - pDestBufPtr >= 2)
168 : {
169 5 : *pDestBufPtr++ = (sal_Unicode) ImplGetHighSurrogate(nUtf32);
170 5 : *pDestBufPtr++ = (sal_Unicode) ImplGetLowSurrogate(nUtf32);
171 : }
172 : else
173 0 : goto no_output;
174 : else
175 : {
176 8 : bUndefined = true;
177 8 : goto bad_input;
178 : }
179 : }
180 24449948 : nShift = -1;
181 24449948 : bCheckBom = false;
182 24449948 : continue;
183 :
184 : bad_input:
185 1318 : switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
186 : bUndefined, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
187 1318 : &nInfo))
188 : {
189 : case sal::detail::textenc::BAD_INPUT_STOP:
190 0 : nShift = -1;
191 0 : bCheckBom = false;
192 0 : if (!bConsume)
193 0 : --pSrcBufPtr;
194 0 : break;
195 :
196 : case sal::detail::textenc::BAD_INPUT_CONTINUE:
197 1262 : nShift = -1;
198 1262 : bCheckBom = false;
199 1262 : if (!bConsume)
200 1153 : --pSrcBufPtr;
201 1262 : continue;
202 :
203 : case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
204 56 : goto no_output;
205 : }
206 0 : break;
207 :
208 : no_output:
209 56 : --pSrcBufPtr;
210 56 : nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
211 56 : break;
212 : }
213 :
214 1216645 : if (nShift >= 0
215 103 : && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
216 : | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL))
217 : == 0)
218 : {
219 47 : if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
220 47 : nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL;
221 : else
222 0 : switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
223 : false, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
224 0 : &nInfo))
225 : {
226 : case sal::detail::textenc::BAD_INPUT_STOP:
227 : case sal::detail::textenc::BAD_INPUT_CONTINUE:
228 0 : nShift = -1;
229 0 : bCheckBom = false;
230 0 : break;
231 :
232 : case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
233 0 : nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
234 0 : break;
235 : }
236 : }
237 :
238 1216645 : if (pContext != NULL)
239 : {
240 629 : static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32 = nUtf32;
241 629 : static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = nShift;
242 629 : static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = bCheckBom;
243 : }
244 1216645 : if (pInfo != NULL)
245 1216645 : *pInfo = nInfo;
246 1216645 : if (pSrcCvtBytes != NULL)
247 1216645 : *pSrcCvtBytes = reinterpret_cast< char const * >(pSrcBufPtr) - pSrcBuf;
248 1216645 : return pDestBufPtr - pDestBuf;
249 : }
250 :
251 1925 : void * ImplCreateUnicodeToUtf8Context()
252 : {
253 1925 : ImplUnicodeToUtf8Context * p = new ImplUnicodeToUtf8Context;
254 1925 : ImplResetUnicodeToUtf8Context(p);
255 1925 : return p;
256 : }
257 :
258 1925 : void ImplResetUnicodeToUtf8Context(void * pContext)
259 : {
260 1925 : if (pContext != NULL)
261 1925 : static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate = 0xFFFF;
262 1925 : }
263 :
264 1925 : void ImplDestroyUnicodeToUtf8Context(void * pContext)
265 : {
266 1925 : delete static_cast< ImplUnicodeToUtf8Context * >(pContext);
267 1925 : }
268 :
269 748011 : sal_Size ImplConvertUnicodeToUtf8(
270 : void const * pData, void * pContext, sal_Unicode const * pSrcBuf,
271 : sal_Size nSrcChars, char * pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
272 : sal_uInt32 * pInfo, sal_Size * pSrcCvtChars)
273 : {
274 748011 : int bJavaUtf8 = pData != NULL;
275 748011 : sal_Unicode nHighSurrogate = 0xFFFF;
276 748011 : sal_uInt32 nInfo = 0;
277 748011 : sal_Unicode const * pSrcBufPtr = pSrcBuf;
278 748011 : sal_Unicode const * pSrcBufEnd = pSrcBufPtr + nSrcChars;
279 748011 : char * pDestBufPtr = pDestBuf;
280 748011 : char * pDestBufEnd = pDestBufPtr + nDestBytes;
281 :
282 748011 : if (pContext != NULL)
283 : nHighSurrogate
284 14790 : = static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate;
285 :
286 748011 : if (nHighSurrogate == 0xFFFF)
287 : {
288 735146 : if ((nFlags & RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE) != 0
289 3 : && !bJavaUtf8)
290 : {
291 2 : if (pDestBufEnd - pDestBufPtr >= 3)
292 : {
293 : /* Write BOM (U+FEFF) as UTF-8: */
294 2 : *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xEF));
295 2 : *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBB));
296 2 : *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBF));
297 : }
298 : else
299 : {
300 0 : nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
301 0 : goto done;
302 : }
303 : }
304 735146 : nHighSurrogate = 0;
305 : }
306 :
307 41898649 : while (pSrcBufPtr < pSrcBufEnd)
308 : {
309 40402627 : sal_uInt32 nChar = *pSrcBufPtr++;
310 40402627 : if (nHighSurrogate == 0)
311 : {
312 40402626 : if (ImplIsHighSurrogate(nChar) && !bJavaUtf8)
313 : {
314 1 : nHighSurrogate = (sal_Unicode) nChar;
315 1 : continue;
316 : }
317 : }
318 1 : else if (ImplIsLowSurrogate(nChar) && !bJavaUtf8)
319 1 : nChar = ImplCombineSurrogates(nHighSurrogate, nChar);
320 : else
321 0 : goto bad_input;
322 :
323 80805253 : if ((ImplIsLowSurrogate(nChar) && !bJavaUtf8)
324 80805252 : || ImplIsNoncharacter(nChar))
325 0 : goto bad_input;
326 :
327 40402626 : if (nChar <= 0x7F && (!bJavaUtf8 || nChar != 0))
328 80772526 : if (pDestBufPtr != pDestBufEnd)
329 40386263 : *pDestBufPtr++ = static_cast< char >(nChar);
330 : else
331 0 : goto no_output;
332 16363 : else if (nChar <= 0x7FF)
333 1671 : if (pDestBufEnd - pDestBufPtr >= 2)
334 : {
335 1671 : *pDestBufPtr++ = static_cast< char >(0xC0 | (nChar >> 6));
336 1671 : *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
337 : }
338 : else
339 0 : goto no_output;
340 14692 : else if (nChar <= 0xFFFF)
341 14691 : if (pDestBufEnd - pDestBufPtr >= 3)
342 : {
343 14691 : *pDestBufPtr++ = static_cast< char >(0xE0 | (nChar >> 12));
344 14691 : *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
345 14691 : *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
346 : }
347 : else
348 0 : goto no_output;
349 1 : else if (pDestBufEnd - pDestBufPtr >= 4)
350 : {
351 1 : *pDestBufPtr++ = static_cast< char >(0xF0 | (nChar >> 18));
352 1 : *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 12) & 0x3F));
353 1 : *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
354 1 : *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
355 : }
356 : else
357 0 : goto no_output;
358 40402626 : nHighSurrogate = 0;
359 40402626 : continue;
360 :
361 : bad_input:
362 0 : switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
363 : false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo, NULL,
364 0 : 0, NULL))
365 : {
366 : case sal::detail::textenc::BAD_INPUT_STOP:
367 0 : nHighSurrogate = 0;
368 0 : break;
369 :
370 : case sal::detail::textenc::BAD_INPUT_CONTINUE:
371 0 : nHighSurrogate = 0;
372 0 : continue;
373 :
374 : case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
375 0 : goto no_output;
376 : }
377 0 : break;
378 :
379 : no_output:
380 0 : --pSrcBufPtr;
381 0 : nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
382 0 : break;
383 : }
384 :
385 748011 : if (nHighSurrogate != 0
386 0 : && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
387 : | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
388 : == 0)
389 : {
390 0 : if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
391 0 : nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
392 : else
393 0 : switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
394 : false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo,
395 0 : NULL, 0, NULL))
396 : {
397 : case sal::detail::textenc::BAD_INPUT_STOP:
398 : case sal::detail::textenc::BAD_INPUT_CONTINUE:
399 0 : nHighSurrogate = 0;
400 0 : break;
401 :
402 : case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
403 0 : nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
404 0 : break;
405 : }
406 : }
407 :
408 : done:
409 748011 : if (pContext != NULL)
410 : static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate
411 14790 : = nHighSurrogate;
412 748011 : if (pInfo != NULL)
413 748011 : *pInfo = nInfo;
414 748011 : if (pSrcCvtChars != NULL)
415 748011 : *pSrcCvtChars = pSrcBufPtr - pSrcBuf;
416 748011 : return pDestBufPtr - pDestBuf;
417 : }
418 :
419 : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|