Line data Source code
1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /*
3 : * This file is part of the LibreOffice project.
4 : *
5 : * This Source Code Form is subject to the terms of the Mozilla Public
6 : * License, v. 2.0. If a copy of the MPL was not distributed with this
7 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 : *
9 : * This file incorporates work covered by the following license notice:
10 : *
11 : * Licensed to the Apache Software Foundation (ASF) under one or more
12 : * contributor license agreements. See the NOTICE file distributed
13 : * with this work for additional information regarding copyright
14 : * ownership. The ASF licenses this file to you under the Apache
15 : * License, Version 2.0 (the "License"); you may not use this file
16 : * except in compliance with the License. You may obtain a copy of
17 : * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 : */
19 :
20 : #include "sal/config.h"
21 :
22 : #include "sal/types.h"
23 : #include "rtl/textcvt.h"
24 :
25 : #include "converter.hxx"
26 : #include "tcvtutf8.hxx"
27 : #include "tenchelp.hxx"
28 : #include "unichars.hxx"
29 :
30 : struct ImplUtf8ToUnicodeContext
31 : {
32 : sal_uInt32 nUtf32;
33 : int nShift;
34 : bool bCheckBom;
35 : };
36 :
37 : struct ImplUnicodeToUtf8Context
38 : {
39 : sal_Unicode nHighSurrogate; /* 0xFFFF: write BOM */
40 : };
41 :
42 59 : void * ImplCreateUtf8ToUnicodeContext()
43 : {
44 59 : ImplUtf8ToUnicodeContext * p = new ImplUtf8ToUnicodeContext;
45 59 : ImplResetUtf8ToUnicodeContext(p);
46 59 : return p;
47 : }
48 :
49 71 : void ImplResetUtf8ToUnicodeContext(void * pContext)
50 : {
51 71 : if (pContext != NULL)
52 : {
53 71 : static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = -1;
54 71 : static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = true;
55 : }
56 71 : }
57 :
58 58 : void ImplDestroyUtf8ToUnicodeContext(void * pContext)
59 : {
60 58 : delete static_cast< ImplUtf8ToUnicodeContext * >(pContext);
61 58 : }
62 :
63 1293427 : sal_Size ImplConvertUtf8ToUnicode(
64 : void const * pData, void * pContext, char const * pSrcBuf,
65 : sal_Size nSrcBytes, sal_Unicode * pDestBuf, sal_Size nDestChars,
66 : sal_uInt32 nFlags, sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes)
67 : {
68 : /*
69 : This function is very liberal with the UTF-8 input. Accepted are:
70 : - non-shortest forms (e.g., C0 41 instead of 41 to represent U+0041)
71 : - surrogates (e.g., ED A0 80 to represent U+D800)
72 : - encodings with up to six bytes (everything outside the range
73 : U+0000..10FFFF is considered "undefined")
74 : The first two of these points allow this routine to translate from both
75 : RTL_TEXTENCODING_UTF8 and RTL_TEXTENCODING_JAVA_UTF8.
76 : */
77 :
78 1293427 : bool bJavaUtf8 = pData != NULL;
79 1293427 : sal_uInt32 nUtf32 = 0;
80 1293427 : int nShift = -1;
81 1293427 : bool bCheckBom = true;
82 1293427 : sal_uInt32 nInfo = 0;
83 1293427 : unsigned char const * pSrcBufPtr = reinterpret_cast<unsigned char const *>(pSrcBuf);
84 1293427 : unsigned char const * pSrcBufEnd = pSrcBufPtr + nSrcBytes;
85 1293427 : sal_Unicode * pDestBufPtr = pDestBuf;
86 1293427 : sal_Unicode * pDestBufEnd = pDestBufPtr + nDestChars;
87 :
88 1293427 : if (pContext != NULL)
89 : {
90 109825 : nUtf32 = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32;
91 109825 : nShift = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift;
92 109825 : bCheckBom = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom;
93 : }
94 :
95 29716962 : while (pSrcBufPtr < pSrcBufEnd)
96 : {
97 27130196 : bool bUndefined = false;
98 27130196 : bool bConsume = true;
99 27130196 : sal_uInt32 nChar = *pSrcBufPtr++;
100 27130196 : if (nShift < 0)
101 21903158 : if (nChar <= 0x7F)
102 : {
103 19249171 : nUtf32 = nChar;
104 19249171 : goto transform;
105 : }
106 2653987 : else if (nChar <= 0xBF)
107 12870 : goto bad_input;
108 2641117 : else if (nChar <= 0xDF)
109 : {
110 43973 : nUtf32 = (nChar & 0x1F) << 6;
111 43973 : nShift = 0;
112 : }
113 2597144 : else if (nChar <= 0xEF)
114 : {
115 2591773 : nUtf32 = (nChar & 0x0F) << 12;
116 2591773 : nShift = 6;
117 : }
118 5371 : else if (nChar <= 0xF7)
119 : {
120 788 : nUtf32 = (nChar & 0x07) << 18;
121 788 : nShift = 12;
122 : }
123 4583 : else if (nChar <= 0xFB)
124 : {
125 753 : nUtf32 = (nChar & 0x03) << 24;
126 753 : nShift = 18;
127 : }
128 3830 : else if (nChar <= 0xFD)
129 : {
130 184 : nUtf32 = (nChar & 0x01) << 30;
131 184 : nShift = 24;
132 : }
133 : else
134 3646 : goto bad_input;
135 5227038 : else if ((nChar & 0xC0) == 0x80)
136 : {
137 5216715 : nUtf32 |= (nChar & 0x3F) << nShift;
138 5216715 : if (nShift == 0)
139 2627144 : goto transform;
140 : else
141 2589571 : nShift -= 6;
142 : }
143 : else
144 : {
145 : /*
146 : This byte is preceded by a broken UTF-8 sequence; if this byte
147 : is neither in the range [0x80..0xBF] nor in the range
148 : [0xFE..0xFF], assume that this byte does not belong to that
149 : broken sequence, but instead starts a new, legal UTF-8 sequence:
150 : */
151 10323 : bConsume = nChar >= 0xFE;
152 10323 : goto bad_input;
153 : }
154 5227042 : continue;
155 :
156 : transform:
157 21876315 : if (!bCheckBom || nUtf32 != 0xFEFF
158 10 : || (nFlags & RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE) == 0
159 6 : || bJavaUtf8)
160 : {
161 21876311 : if (nUtf32 <= 0xFFFF)
162 21876288 : if (pDestBufPtr != pDestBufEnd)
163 21876287 : *pDestBufPtr++ = (sal_Unicode) nUtf32;
164 : else
165 1 : goto no_output;
166 23 : else if (nUtf32 <= 0x10FFFF)
167 15 : if (pDestBufEnd - pDestBufPtr >= 2)
168 : {
169 15 : *pDestBufPtr++ = (sal_Unicode) ImplGetHighSurrogate(nUtf32);
170 15 : *pDestBufPtr++ = (sal_Unicode) ImplGetLowSurrogate(nUtf32);
171 : }
172 : else
173 0 : goto no_output;
174 : else
175 : {
176 8 : bUndefined = true;
177 8 : goto bad_input;
178 : }
179 : }
180 21876306 : nShift = -1;
181 21876306 : bCheckBom = false;
182 21876306 : continue;
183 :
184 : bad_input:
185 26847 : switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
186 : bUndefined, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
187 26847 : &nInfo))
188 : {
189 : case sal::detail::textenc::BAD_INPUT_STOP:
190 31 : nShift = -1;
191 31 : bCheckBom = false;
192 31 : if (!bConsume)
193 2 : --pSrcBufPtr;
194 31 : break;
195 :
196 : case sal::detail::textenc::BAD_INPUT_CONTINUE:
197 26760 : nShift = -1;
198 26760 : bCheckBom = false;
199 26760 : if (!bConsume)
200 9470 : --pSrcBufPtr;
201 26760 : continue;
202 :
203 : case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
204 56 : goto no_output;
205 : }
206 31 : break;
207 :
208 : no_output:
209 57 : --pSrcBufPtr;
210 57 : nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
211 57 : break;
212 : }
213 :
214 1293427 : if (nShift >= 0
215 106 : && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
216 : | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL))
217 : == 0)
218 : {
219 50 : if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
220 49 : nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL;
221 : else
222 1 : switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
223 : false, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
224 1 : &nInfo))
225 : {
226 : case sal::detail::textenc::BAD_INPUT_STOP:
227 : case sal::detail::textenc::BAD_INPUT_CONTINUE:
228 1 : nShift = -1;
229 1 : bCheckBom = false;
230 1 : break;
231 :
232 : case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
233 0 : nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
234 0 : break;
235 : }
236 : }
237 :
238 1293427 : if (pContext != NULL)
239 : {
240 109825 : static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32 = nUtf32;
241 109825 : static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = nShift;
242 109825 : static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = bCheckBom;
243 : }
244 1293427 : if (pInfo != NULL)
245 1293427 : *pInfo = nInfo;
246 1293427 : if (pSrcCvtBytes != NULL)
247 1293427 : *pSrcCvtBytes = reinterpret_cast< char const * >(pSrcBufPtr) - pSrcBuf;
248 1293427 : return pDestBufPtr - pDestBuf;
249 : }
250 :
251 2332 : void * ImplCreateUnicodeToUtf8Context()
252 : {
253 2332 : ImplUnicodeToUtf8Context * p = new ImplUnicodeToUtf8Context;
254 2332 : ImplResetUnicodeToUtf8Context(p);
255 2332 : return p;
256 : }
257 :
258 2332 : void ImplResetUnicodeToUtf8Context(void * pContext)
259 : {
260 2332 : if (pContext != NULL)
261 2332 : static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate = 0xFFFF;
262 2332 : }
263 :
264 2333 : void ImplDestroyUnicodeToUtf8Context(void * pContext)
265 : {
266 2333 : delete static_cast< ImplUnicodeToUtf8Context * >(pContext);
267 2333 : }
268 :
269 876120 : sal_Size ImplConvertUnicodeToUtf8(
270 : void const * pData, void * pContext, sal_Unicode const * pSrcBuf,
271 : sal_Size nSrcChars, char * pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
272 : sal_uInt32 * pInfo, sal_Size * pSrcCvtChars)
273 : {
274 876120 : bool bJavaUtf8 = pData != NULL;
275 876120 : sal_Unicode nHighSurrogate = 0xFFFF;
276 876120 : sal_uInt32 nInfo = 0;
277 876120 : sal_Unicode const * pSrcBufPtr = pSrcBuf;
278 876120 : sal_Unicode const * pSrcBufEnd = pSrcBufPtr + nSrcChars;
279 876120 : char * pDestBufPtr = pDestBuf;
280 876120 : char * pDestBufEnd = pDestBufPtr + nDestBytes;
281 :
282 876120 : if (pContext != NULL)
283 : nHighSurrogate
284 19855 : = static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate;
285 :
286 876120 : if (nHighSurrogate == 0xFFFF)
287 : {
288 858597 : if ((nFlags & RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE) != 0
289 3 : && !bJavaUtf8)
290 : {
291 2 : if (pDestBufEnd - pDestBufPtr >= 3)
292 : {
293 : /* Write BOM (U+FEFF) as UTF-8: */
294 2 : *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xEF));
295 2 : *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBB));
296 2 : *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBF));
297 : }
298 : else
299 : {
300 0 : nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
301 0 : goto done;
302 : }
303 : }
304 858597 : nHighSurrogate = 0;
305 : }
306 :
307 37654615 : while (pSrcBufPtr < pSrcBufEnd)
308 : {
309 35902375 : sal_uInt32 nChar = *pSrcBufPtr++;
310 35902375 : if (nHighSurrogate == 0)
311 : {
312 35902374 : if (ImplIsHighSurrogate(nChar) && !bJavaUtf8)
313 : {
314 1 : nHighSurrogate = (sal_Unicode) nChar;
315 1 : continue;
316 : }
317 : }
318 1 : else if (ImplIsLowSurrogate(nChar) && !bJavaUtf8)
319 1 : nChar = ImplCombineSurrogates(nHighSurrogate, nChar);
320 : else
321 0 : goto bad_input;
322 :
323 71804749 : if ((ImplIsLowSurrogate(nChar) && !bJavaUtf8)
324 71804748 : || ImplIsNoncharacter(nChar))
325 2 : goto bad_input;
326 :
327 35902372 : if (nChar <= 0x7F && (!bJavaUtf8 || nChar != 0))
328 71668336 : if (pDestBufPtr != pDestBufEnd)
329 35834168 : *pDestBufPtr++ = static_cast< char >(nChar);
330 : else
331 0 : goto no_output;
332 68204 : else if (nChar <= 0x7FF)
333 4977 : if (pDestBufEnd - pDestBufPtr >= 2)
334 : {
335 4977 : *pDestBufPtr++ = static_cast< char >(0xC0 | (nChar >> 6));
336 4977 : *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
337 : }
338 : else
339 0 : goto no_output;
340 63227 : else if (nChar <= 0xFFFF)
341 63226 : if (pDestBufEnd - pDestBufPtr >= 3)
342 : {
343 63226 : *pDestBufPtr++ = static_cast< char >(0xE0 | (nChar >> 12));
344 63226 : *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
345 63226 : *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
346 : }
347 : else
348 0 : goto no_output;
349 1 : else if (pDestBufEnd - pDestBufPtr >= 4)
350 : {
351 1 : *pDestBufPtr++ = static_cast< char >(0xF0 | (nChar >> 18));
352 1 : *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 12) & 0x3F));
353 1 : *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
354 1 : *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
355 : }
356 : else
357 0 : goto no_output;
358 35902372 : nHighSurrogate = 0;
359 35902372 : continue;
360 :
361 : bad_input:
362 2 : switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
363 : false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo, NULL,
364 2 : 0, NULL))
365 : {
366 : case sal::detail::textenc::BAD_INPUT_STOP:
367 0 : nHighSurrogate = 0;
368 0 : break;
369 :
370 : case sal::detail::textenc::BAD_INPUT_CONTINUE:
371 2 : nHighSurrogate = 0;
372 2 : continue;
373 :
374 : case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
375 0 : goto no_output;
376 : }
377 0 : break;
378 :
379 : no_output:
380 0 : --pSrcBufPtr;
381 0 : nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
382 0 : break;
383 : }
384 :
385 876120 : if (nHighSurrogate != 0
386 0 : && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
387 : | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
388 : == 0)
389 : {
390 0 : if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
391 0 : nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
392 : else
393 0 : switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
394 : false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo,
395 0 : NULL, 0, NULL))
396 : {
397 : case sal::detail::textenc::BAD_INPUT_STOP:
398 : case sal::detail::textenc::BAD_INPUT_CONTINUE:
399 0 : nHighSurrogate = 0;
400 0 : break;
401 :
402 : case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
403 0 : nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
404 0 : break;
405 : }
406 : }
407 :
408 : done:
409 876120 : if (pContext != NULL)
410 : static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate
411 19855 : = nHighSurrogate;
412 876120 : if (pInfo != NULL)
413 876120 : *pInfo = nInfo;
414 876120 : if (pSrcCvtChars != NULL)
415 876120 : *pSrcCvtChars = pSrcBufPtr - pSrcBuf;
416 876120 : return pDestBufPtr - pDestBuf;
417 : }
418 :
419 : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|