Line data Source code
1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /*
3 : * This file is part of the LibreOffice project.
4 : *
5 : * This Source Code Form is subject to the terms of the Mozilla Public
6 : * License, v. 2.0. If a copy of the MPL was not distributed with this
7 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 : *
9 : * This file incorporates work covered by the following license notice:
10 : *
11 : * Licensed to the Apache Software Foundation (ASF) under one or more
12 : * contributor license agreements. See the NOTICE file distributed
13 : * with this work for additional information regarding copyright
14 : * ownership. The ASF licenses this file to you under the Apache
15 : * License, Version 2.0 (the "License"); you may not use this file
16 : * except in compliance with the License. You may obtain a copy of
17 : * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 : */
19 :
20 : #include "sal/config.h"
21 :
22 : #include "sal/types.h"
23 : #include "rtl/textcvt.h"
24 :
25 : #include "converter.hxx"
26 : #include "tcvtutf8.hxx"
27 : #include "tenchelp.hxx"
28 : #include "unichars.hxx"
29 :
30 : struct ImplUtf8ToUnicodeContext
31 : {
32 : sal_uInt32 nUtf32;
33 : int nShift;
34 : bool bCheckBom;
35 : };
36 :
37 : struct ImplUnicodeToUtf8Context
38 : {
39 : sal_Unicode nHighSurrogate; /* 0xFFFF: write BOM */
40 : };
41 :
42 0 : void * ImplCreateUtf8ToUnicodeContext()
43 : {
44 0 : ImplUtf8ToUnicodeContext * p = new ImplUtf8ToUnicodeContext;
45 0 : ImplResetUtf8ToUnicodeContext(p);
46 0 : return p;
47 : }
48 :
49 0 : void ImplResetUtf8ToUnicodeContext(void * pContext)
50 : {
51 0 : if (pContext != NULL)
52 : {
53 0 : static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = -1;
54 0 : static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = true;
55 : }
56 0 : }
57 :
58 0 : void ImplDestroyUtf8ToUnicodeContext(void * pContext)
59 : {
60 0 : delete static_cast< ImplUtf8ToUnicodeContext * >(pContext);
61 0 : }
62 :
63 1147854 : sal_Size ImplConvertUtf8ToUnicode(
64 : void const * pData, void * pContext, char const * pSrcBuf,
65 : sal_Size nSrcBytes, sal_Unicode * pDestBuf, sal_Size nDestChars,
66 : sal_uInt32 nFlags, sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes)
67 : {
68 : /*
69 : This function is very liberal with the UTF-8 input. Accepted are:
70 : - non-shortest forms (e.g., C0 41 instead of 41 to represent U+0041)
71 : - surrogates (e.g., ED A0 80 to represent U+D800)
72 : - encodings with up to six bytes (everything outside the range
73 : U+0000..10FFFF is considered "undefined")
74 : The first two of these points allow this routine to translate from both
75 : RTL_TEXTENCODING_UTF8 and RTL_TEXTENCODING_JAVA_UTF8.
76 : */
77 :
78 1147854 : bool bJavaUtf8 = pData != NULL;
79 1147854 : sal_uInt32 nUtf32 = 0;
80 1147854 : int nShift = -1;
81 1147854 : bool bCheckBom = true;
82 1147854 : sal_uInt32 nInfo = 0;
83 1147854 : unsigned char const * pSrcBufPtr = (unsigned char const *) pSrcBuf;
84 1147854 : unsigned char const * pSrcBufEnd = pSrcBufPtr + nSrcBytes;
85 1147854 : sal_Unicode * pDestBufPtr = pDestBuf;
86 1147854 : sal_Unicode * pDestBufEnd = pDestBufPtr + nDestChars;
87 :
88 1147854 : if (pContext != NULL)
89 : {
90 0 : nUtf32 = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32;
91 0 : nShift = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift;
92 0 : bCheckBom = static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom;
93 : }
94 :
95 31185620 : while (pSrcBufPtr < pSrcBufEnd)
96 : {
97 28889912 : bool bUndefined = false;
98 28889912 : bool bConsume = true;
99 28889912 : sal_uInt32 nChar = *pSrcBufPtr++;
100 28889912 : if (nShift < 0)
101 28885959 : if (nChar <= 0x7F)
102 : {
103 28883978 : nUtf32 = nChar;
104 28883978 : goto transform;
105 : }
106 1981 : else if (nChar <= 0xBF)
107 0 : goto bad_input;
108 1981 : else if (nChar <= 0xDF)
109 : {
110 9 : nUtf32 = (nChar & 0x1F) << 6;
111 9 : nShift = 0;
112 : }
113 1972 : else if (nChar <= 0xEF)
114 : {
115 1972 : nUtf32 = (nChar & 0x0F) << 12;
116 1972 : nShift = 6;
117 : }
118 0 : else if (nChar <= 0xF7)
119 : {
120 0 : nUtf32 = (nChar & 0x07) << 18;
121 0 : nShift = 12;
122 : }
123 0 : else if (nChar <= 0xFB)
124 : {
125 0 : nUtf32 = (nChar & 0x03) << 24;
126 0 : nShift = 18;
127 : }
128 0 : else if (nChar <= 0xFD)
129 : {
130 0 : nUtf32 = (nChar & 0x01) << 30;
131 0 : nShift = 24;
132 : }
133 : else
134 0 : goto bad_input;
135 3953 : else if ((nChar & 0xC0) == 0x80)
136 : {
137 3953 : nUtf32 |= (nChar & 0x3F) << nShift;
138 3953 : if (nShift == 0)
139 1981 : goto transform;
140 : else
141 1972 : nShift -= 6;
142 : }
143 : else
144 : {
145 : /*
146 : This byte is preceded by a broken UTF-8 sequence; if this byte
147 : is neither in the range [0x80..0xBF] nor in the range
148 : [0xFE..0xFF], assume that this byte does not belong to that
149 : broken sequence, but instead starts a new, legal UTF-8 sequence:
150 : */
151 0 : bConsume = nChar >= 0xFE;
152 0 : goto bad_input;
153 : }
154 3953 : continue;
155 :
156 : transform:
157 28885959 : if (!bCheckBom || nUtf32 != 0xFEFF
158 0 : || (nFlags & RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE) == 0
159 0 : || bJavaUtf8)
160 : {
161 28885959 : if (nUtf32 <= 0xFFFF)
162 28885959 : if (pDestBufPtr != pDestBufEnd)
163 28885959 : *pDestBufPtr++ = (sal_Unicode) nUtf32;
164 : else
165 0 : goto no_output;
166 0 : else if (nUtf32 <= 0x10FFFF)
167 0 : if (pDestBufEnd - pDestBufPtr >= 2)
168 : {
169 0 : *pDestBufPtr++ = (sal_Unicode) ImplGetHighSurrogate(nUtf32);
170 0 : *pDestBufPtr++ = (sal_Unicode) ImplGetLowSurrogate(nUtf32);
171 : }
172 : else
173 0 : goto no_output;
174 : else
175 : {
176 0 : bUndefined = true;
177 0 : goto bad_input;
178 : }
179 : }
180 28885959 : nShift = -1;
181 28885959 : bCheckBom = false;
182 28885959 : continue;
183 :
184 : bad_input:
185 0 : switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
186 : bUndefined, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
187 0 : &nInfo))
188 : {
189 : case sal::detail::textenc::BAD_INPUT_STOP:
190 0 : nShift = -1;
191 0 : bCheckBom = false;
192 0 : if (!bConsume)
193 0 : --pSrcBufPtr;
194 0 : break;
195 :
196 : case sal::detail::textenc::BAD_INPUT_CONTINUE:
197 0 : nShift = -1;
198 0 : bCheckBom = false;
199 0 : if (!bConsume)
200 0 : --pSrcBufPtr;
201 0 : continue;
202 :
203 : case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
204 0 : goto no_output;
205 : }
206 0 : break;
207 :
208 : no_output:
209 0 : --pSrcBufPtr;
210 0 : nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
211 0 : break;
212 : }
213 :
214 1147854 : if (nShift >= 0
215 0 : && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
216 : | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL))
217 : == 0)
218 : {
219 0 : if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
220 0 : nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL;
221 : else
222 0 : switch (sal::detail::textenc::handleBadInputTextToUnicodeConversion(
223 : false, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
224 0 : &nInfo))
225 : {
226 : case sal::detail::textenc::BAD_INPUT_STOP:
227 : case sal::detail::textenc::BAD_INPUT_CONTINUE:
228 0 : nShift = -1;
229 0 : bCheckBom = false;
230 0 : break;
231 :
232 : case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
233 0 : nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
234 0 : break;
235 : }
236 : }
237 :
238 1147854 : if (pContext != NULL)
239 : {
240 0 : static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nUtf32 = nUtf32;
241 0 : static_cast< ImplUtf8ToUnicodeContext * >(pContext)->nShift = nShift;
242 0 : static_cast< ImplUtf8ToUnicodeContext * >(pContext)->bCheckBom = bCheckBom;
243 : }
244 1147854 : if (pInfo != NULL)
245 1147854 : *pInfo = nInfo;
246 1147854 : if (pSrcCvtBytes != NULL)
247 1147854 : *pSrcCvtBytes = reinterpret_cast< char const * >(pSrcBufPtr) - pSrcBuf;
248 1147854 : return pDestBufPtr - pDestBuf;
249 : }
250 :
251 0 : void * ImplCreateUnicodeToUtf8Context()
252 : {
253 0 : ImplUnicodeToUtf8Context * p = new ImplUnicodeToUtf8Context;
254 0 : ImplResetUnicodeToUtf8Context(p);
255 0 : return p;
256 : }
257 :
258 0 : void ImplResetUnicodeToUtf8Context(void * pContext)
259 : {
260 0 : if (pContext != NULL)
261 0 : static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate = 0xFFFF;
262 0 : }
263 :
264 0 : void ImplDestroyUnicodeToUtf8Context(void * pContext)
265 : {
266 0 : delete static_cast< ImplUnicodeToUtf8Context * >(pContext);
267 0 : }
268 :
269 1530378 : sal_Size ImplConvertUnicodeToUtf8(
270 : void const * pData, void * pContext, sal_Unicode const * pSrcBuf,
271 : sal_Size nSrcChars, char * pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
272 : sal_uInt32 * pInfo, sal_Size * pSrcCvtChars)
273 : {
274 1530378 : bool bJavaUtf8 = pData != NULL;
275 1530378 : sal_Unicode nHighSurrogate = 0xFFFF;
276 1530378 : sal_uInt32 nInfo = 0;
277 1530378 : sal_Unicode const * pSrcBufPtr = pSrcBuf;
278 1530378 : sal_Unicode const * pSrcBufEnd = pSrcBufPtr + nSrcChars;
279 1530378 : char * pDestBufPtr = pDestBuf;
280 1530378 : char * pDestBufEnd = pDestBufPtr + nDestBytes;
281 :
282 1530378 : if (pContext != NULL)
283 : nHighSurrogate
284 0 : = static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate;
285 :
286 1530378 : if (nHighSurrogate == 0xFFFF)
287 : {
288 1530378 : if ((nFlags & RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE) != 0
289 0 : && !bJavaUtf8)
290 : {
291 0 : if (pDestBufEnd - pDestBufPtr >= 3)
292 : {
293 : /* Write BOM (U+FEFF) as UTF-8: */
294 0 : *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xEF));
295 0 : *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBB));
296 0 : *pDestBufPtr++ = static_cast< char >(static_cast< unsigned char >(0xBF));
297 : }
298 : else
299 : {
300 0 : nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
301 0 : goto done;
302 : }
303 : }
304 1530378 : nHighSurrogate = 0;
305 : }
306 :
307 53434650 : while (pSrcBufPtr < pSrcBufEnd)
308 : {
309 50373894 : sal_uInt32 nChar = *pSrcBufPtr++;
310 50373894 : if (nHighSurrogate == 0)
311 : {
312 50373894 : if (ImplIsHighSurrogate(nChar) && !bJavaUtf8)
313 : {
314 0 : nHighSurrogate = (sal_Unicode) nChar;
315 0 : continue;
316 : }
317 : }
318 0 : else if (ImplIsLowSurrogate(nChar) && !bJavaUtf8)
319 0 : nChar = ImplCombineSurrogates(nHighSurrogate, nChar);
320 : else
321 0 : goto bad_input;
322 :
323 100747788 : if ((ImplIsLowSurrogate(nChar) && !bJavaUtf8)
324 100747788 : || ImplIsNoncharacter(nChar))
325 0 : goto bad_input;
326 :
327 50373894 : if (nChar <= 0x7F && (!bJavaUtf8 || nChar != 0))
328 100747788 : if (pDestBufPtr != pDestBufEnd)
329 50373894 : *pDestBufPtr++ = static_cast< char >(nChar);
330 : else
331 0 : goto no_output;
332 0 : else if (nChar <= 0x7FF)
333 0 : if (pDestBufEnd - pDestBufPtr >= 2)
334 : {
335 0 : *pDestBufPtr++ = static_cast< char >(0xC0 | (nChar >> 6));
336 0 : *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
337 : }
338 : else
339 0 : goto no_output;
340 0 : else if (nChar <= 0xFFFF)
341 0 : if (pDestBufEnd - pDestBufPtr >= 3)
342 : {
343 0 : *pDestBufPtr++ = static_cast< char >(0xE0 | (nChar >> 12));
344 0 : *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
345 0 : *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
346 : }
347 : else
348 0 : goto no_output;
349 0 : else if (pDestBufEnd - pDestBufPtr >= 4)
350 : {
351 0 : *pDestBufPtr++ = static_cast< char >(0xF0 | (nChar >> 18));
352 0 : *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 12) & 0x3F));
353 0 : *pDestBufPtr++ = static_cast< char >(0x80 | ((nChar >> 6) & 0x3F));
354 0 : *pDestBufPtr++ = static_cast< char >(0x80 | (nChar & 0x3F));
355 : }
356 : else
357 0 : goto no_output;
358 50373894 : nHighSurrogate = 0;
359 50373894 : continue;
360 :
361 : bad_input:
362 0 : switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
363 : false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo, NULL,
364 0 : 0, NULL))
365 : {
366 : case sal::detail::textenc::BAD_INPUT_STOP:
367 0 : nHighSurrogate = 0;
368 0 : break;
369 :
370 : case sal::detail::textenc::BAD_INPUT_CONTINUE:
371 0 : nHighSurrogate = 0;
372 0 : continue;
373 :
374 : case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
375 0 : goto no_output;
376 : }
377 0 : break;
378 :
379 : no_output:
380 0 : --pSrcBufPtr;
381 0 : nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
382 0 : break;
383 : }
384 :
385 1530378 : if (nHighSurrogate != 0
386 0 : && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
387 : | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
388 : == 0)
389 : {
390 0 : if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
391 0 : nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
392 : else
393 0 : switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
394 : false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo,
395 0 : NULL, 0, NULL))
396 : {
397 : case sal::detail::textenc::BAD_INPUT_STOP:
398 : case sal::detail::textenc::BAD_INPUT_CONTINUE:
399 0 : nHighSurrogate = 0;
400 0 : break;
401 :
402 : case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
403 0 : nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
404 0 : break;
405 : }
406 : }
407 :
408 : done:
409 1530378 : if (pContext != NULL)
410 : static_cast< ImplUnicodeToUtf8Context * >(pContext)->nHighSurrogate
411 0 : = nHighSurrogate;
412 1530378 : if (pInfo != NULL)
413 1530378 : *pInfo = nInfo;
414 1530378 : if (pSrcCvtChars != NULL)
415 1530378 : *pSrcCvtChars = pSrcBufPtr - pSrcBuf;
416 1530378 : return pDestBufPtr - pDestBuf;
417 : }
418 :
419 : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|