Line data Source code
1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /*
3 : * This file is part of the LibreOffice project.
4 : *
5 : * This Source Code Form is subject to the terms of the Mozilla Public
6 : * License, v. 2.0. If a copy of the MPL was not distributed with this
7 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 : */
9 :
10 : #include "converter.hxx"
11 : #include "unichars.hxx"
12 : #include "convertisciidevangari.hxx"
13 : #include "convertsinglebytetobmpunicode.hxx"
14 : #include <rtl/textcvt.h>
15 :
16 : using namespace sal::detail::textenc;
17 : using namespace rtl::textenc;
18 :
19 : struct IsciiDevanagariToUnicode
20 : {
21 : sal_uInt8 m_cPrevChar;
22 4 : IsciiDevanagariToUnicode()
23 4 : : m_cPrevChar(0)
24 : {
25 4 : }
26 0 : void reset()
27 : {
28 0 : m_cPrevChar = 0;
29 0 : }
30 : sal_Size convert(char const* pSrcBuf, sal_Size nSrcBytes,
31 : sal_Unicode* pDestBuf, sal_Size nDestChars, sal_uInt32 nFlags,
32 : sal_uInt32* pInfo, sal_Size* pSrcCvtBytes);
33 : };
34 :
35 : struct UnicodeToIsciiDevanagari
36 : {
37 : sal_Unicode m_cPrevChar;
38 : sal_Unicode m_cHighSurrogate;
39 2 : UnicodeToIsciiDevanagari()
40 : : m_cPrevChar(0)
41 2 : , m_cHighSurrogate(0)
42 : {
43 2 : }
44 0 : void reset()
45 : {
46 0 : m_cPrevChar = 0;
47 0 : m_cHighSurrogate = 0;
48 0 : }
49 : sal_Size convert(sal_Unicode const* pSrcBuf, sal_Size nSrcChars,
50 : char* pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
51 : sal_uInt32 * pInfo, sal_Size * pSrcCvtChars);
52 : };
53 :
54 : static const sal_Unicode IsciiDevanagariMap[256] =
55 : {
56 : 0x0000,0x0001,0x0002,0x0003,0x0004,0x0005,0x0006,0x0007,
57 : 0x0008,0x0009,0x000A,0x000B,0x000C,0x000D,0x000E,0x000F,
58 : 0x0010,0x0011,0x0012,0x0013,0x0014,0x0015,0x0016,0x0017,
59 : 0x0018,0x0019,0x001A,0x001B,0x001C,0x001D,0x001E,0x001F,
60 : 0x0020,0x0021,0x0022,0x0023,0x0024,0x0025,0x0026,0x0027,
61 : 0x0028,0x0029,0x002A,0x002B,0x002C,0x002D,0x002E,0x002F,
62 : 0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,
63 : 0x0038,0x0039,0x003A,0x003B,0x003C,0x003D,0x003E,0x003F,
64 : 0x0040,0x0041,0x0042,0x0043,0x0044,0x0045,0x0046,0x0047,
65 : 0x0048,0x0049,0x004A,0x004B,0x004C,0x004D,0x004E,0x004F,
66 : 0x0050,0x0051,0x0052,0x0053,0x0054,0x0055,0x0056,0x0057,
67 : 0x0058,0x0059,0x005A,0x005B,0x005C,0x005D,0x005E,0x005F,
68 : 0x0060,0x0061,0x0062,0x0063,0x0064,0x0065,0x0066,0x0067,
69 : 0x0068,0x0069,0x006A,0x006B,0x006C,0x006D,0x006E,0x006F,
70 : 0x0070,0x0071,0x0072,0x0073,0x0074,0x0075,0x0076,0x0077,
71 : 0x0078,0x0079,0x007A,0x007B,0x007C,0x007D,0x007E,0x007F,
72 : 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
73 : 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
74 : 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
75 : 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
76 : 0xFFFF,0x0901,0x0902,0x0903,0x0905,0x0906,0x0907,0x0908,
77 : 0x0909,0x090A,0x090B,0x090E,0x090F,0x0910,0x090D,0x0912,
78 : 0x0913,0x0914,0x0911,0x0915,0x0916,0x0917,0x0918,0x0919,
79 : 0x091A,0x091B,0x091C,0x091D,0x091E,0x091F,0x0920,0x0921,
80 : 0x0922,0x0923,0x0924,0x0925,0x0926,0x0927,0x0928,0x0929,
81 : 0x092A,0x092B,0x092C,0x092D,0x092E,0x092F,0x095F,0x0930,
82 : 0x0931,0x0932,0x0933,0x0934,0x0935,0x0936,0x0937,0x0938,
83 : 0x0939,0xFFFF,0x093E,0x093F,0x0940,0x0941,0x0942,0x0943,
84 : 0x0946,0x0947,0x0948,0x0945,0x094A,0x094B,0x094C,0x0949,
85 : 0x094D,0x093C,0x0964,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
86 : 0xFFFF,0x0966,0x0967,0x0968,0x0969,0x096A,0x096B,0x096C,
87 : 0x096D,0x096E,0x096F,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF
88 : };
89 :
90 83 : sal_Size IsciiDevanagariToUnicode::convert(
91 : char const* pSrcBuf, sal_Size nSrcBytes,
92 : sal_Unicode* pDestBuf, sal_Size nDestChars, sal_uInt32 nFlags,
93 : sal_uInt32* pInfo, sal_Size* pSrcCvtBytes)
94 : {
95 83 : sal_uInt32 nInfo = 0;
96 83 : sal_Size nConverted = 0;
97 83 : sal_Unicode* pDestBufPtr = pDestBuf;
98 83 : sal_Unicode* pDestBufEnd = pDestBuf + nDestChars;
99 :
100 324 : while (nConverted < nSrcBytes)
101 : {
102 158 : if (pDestBufPtr == pDestBufEnd)
103 : {
104 0 : nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
105 0 : break;
106 : }
107 :
108 158 : sal_Unicode cChar = sal_Unicode();
109 158 : sal_uInt8 nIn = static_cast<sal_uInt8>(pSrcBuf[nConverted]);
110 158 : sal_uInt8 nNext = nConverted < nSrcBytes + 1 ? static_cast<sal_uInt8>(pSrcBuf[nConverted+1]) : 0;
111 158 : bool bNormal = true;
112 158 : bool bDouble = false;
113 : //halant + halant E8 E8 -> halant + ZWNJ 094D 200C
114 : //halant + nukta E8 E9 halant + ZWJ 094D 200D
115 158 : if (m_cPrevChar == 0xE8 && nIn == 0xE8)
116 : {
117 2 : cChar = 0x200C;
118 2 : bNormal = false;
119 : }
120 156 : else if (m_cPrevChar == 0xE8 && nIn == 0xE9)
121 : {
122 2 : cChar = 0x200D;
123 2 : bNormal = false;
124 : }
125 154 : else if (nNext == 0xE9)
126 : {
127 6 : bNormal = false;
128 6 : bDouble = true;
129 6 : switch(nIn)
130 : {
131 : case 0xA1:
132 2 : cChar = 0x0950;
133 2 : break;
134 : case 0xA6:
135 0 : cChar = 0x090C;
136 0 : break;
137 : case 0xA7:
138 0 : cChar = 0x0961;
139 0 : break;
140 : case 0xAA:
141 0 : cChar = 0x0960;
142 0 : break;
143 : case 0xB3:
144 0 : cChar = 0x0958;
145 0 : break;
146 : case 0xB4:
147 0 : cChar = 0x0959;
148 0 : break;
149 : case 0xB5:
150 0 : cChar = 0x095A;
151 0 : break;
152 : case 0xBA:
153 0 : cChar = 0x095B;
154 0 : break;
155 : case 0xBF:
156 0 : cChar = 0x095C;
157 0 : break;
158 : case 0xC0:
159 0 : cChar = 0x095D;
160 0 : break;
161 : case 0xC9:
162 0 : cChar = 0x095E;
163 0 : break;
164 : case 0xDB:
165 0 : cChar = 0x0962;
166 0 : break;
167 : case 0xDC:
168 0 : cChar = 0x0963;
169 0 : break;
170 : case 0xDF:
171 0 : cChar = 0x0944;
172 0 : break;
173 : case 0xEA:
174 2 : cChar = 0x093D;
175 2 : break;
176 : default:
177 2 : bNormal = true;
178 2 : bDouble = false;
179 2 : break;
180 : }
181 : }
182 :
183 158 : if (bNormal)
184 150 : cChar = IsciiDevanagariMap[nIn];
185 :
186 158 : bool bUndefined = cChar == 0xffff;
187 :
188 158 : if (bUndefined)
189 : {
190 : BadInputConversionAction eAction = handleBadInputTextToUnicodeConversion(
191 : bUndefined, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
192 0 : &nInfo);
193 0 : if (eAction == BAD_INPUT_CONTINUE)
194 0 : continue;
195 0 : if (eAction == BAD_INPUT_STOP)
196 0 : break;
197 0 : else if (eAction == BAD_INPUT_NO_OUTPUT)
198 : {
199 0 : nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
200 0 : break;
201 : }
202 : }
203 158 : ++nConverted;
204 158 : if (bDouble)
205 4 : ++nConverted;
206 :
207 158 : *pDestBufPtr++ = cChar;
208 158 : m_cPrevChar = bNormal ? nIn : 0;
209 : }
210 :
211 83 : if (pInfo)
212 83 : *pInfo = nInfo;
213 83 : if (pSrcCvtBytes)
214 83 : *pSrcCvtBytes = nConverted;
215 :
216 83 : return pDestBufPtr - pDestBuf;
217 : }
218 :
219 : BmpUnicodeToSingleByteRange const unicodeToISCIIEncoding[] =
220 : {
221 : { 0x0000, 0x007F - 0x0000, 0x00 }, { 0x0901, 0x0903 - 0x0901, 0xA1 },
222 : { 0x0905, 0x090B - 0x0905, 0xA4 }, { 0x090D, 0x090D - 0x090D, 0xAE },
223 : { 0x090E, 0x0910 - 0x090E, 0xAB }, { 0x0911, 0x0911 - 0x0911, 0xB2 },
224 : { 0x0912, 0x0914 - 0x0912, 0xAF }, { 0x0915, 0x092F - 0x0915, 0xB3 },
225 : { 0x0930, 0x0939 - 0x0930, 0xCF }, { 0x093C, 0x093C - 0x093C, 0xE9 },
226 : { 0x093E, 0x0943 - 0x093E, 0xDA }, { 0x0945, 0x0945 - 0x0945, 0xE3 },
227 : { 0x0946, 0x0948 - 0x0946, 0xE0 }, { 0x0949, 0x0949 - 0x0949, 0xE7 },
228 : { 0x094A, 0x094C - 0x094A, 0xE4 }, { 0x094D, 0x094D - 0x094D, 0xE8 },
229 : { 0x095F, 0x095F - 0x095F, 0xCE }, { 0x0964, 0x0964 - 0x0964, 0xEA },
230 : { 0x0966, 0x096F - 0x0966, 0xF1 }
231 : };
232 :
233 2 : sal_Size UnicodeToIsciiDevanagari::convert(sal_Unicode const* pSrcBuf, sal_Size nSrcChars,
234 : char* pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
235 : sal_uInt32 * pInfo, sal_Size* pSrcCvtChars)
236 : {
237 2 : size_t entries = SAL_N_ELEMENTS(unicodeToISCIIEncoding);
238 2 : BmpUnicodeToSingleByteRange const * ranges = unicodeToISCIIEncoding;
239 :
240 2 : sal_Unicode cHighSurrogate = m_cHighSurrogate;
241 2 : sal_uInt32 nInfo = 0;
242 2 : sal_Size nConverted = 0;
243 2 : sal_Char* pDestBufPtr = pDestBuf;
244 2 : sal_Char* pDestBufEnd = pDestBuf + nDestBytes;
245 81 : for (; nConverted < nSrcChars; ++nConverted)
246 : {
247 79 : bool bUndefined = true;
248 79 : sal_uInt32 c = *pSrcBuf++;
249 79 : sal_Char cSpecialChar = 0;
250 79 : if (cHighSurrogate == 0)
251 : {
252 79 : if (ImplIsHighSurrogate(c))
253 : {
254 0 : cHighSurrogate = static_cast< sal_Unicode >(c);
255 0 : continue;
256 : }
257 : }
258 0 : else if (ImplIsLowSurrogate(c))
259 : {
260 0 : c = ImplCombineSurrogates(cHighSurrogate, c);
261 : }
262 : else
263 : {
264 0 : bUndefined = false;
265 0 : goto bad_input;
266 : }
267 79 : if (ImplIsLowSurrogate(c) || ImplIsNoncharacter(c))
268 : {
269 0 : bUndefined = false;
270 0 : goto bad_input;
271 : }
272 :
273 : //halant + halant E8 E8 -> halant + ZWNJ 094D 200C
274 : //halant + nukta E8 E9 halant + ZWJ 094D 200D
275 79 : if (m_cPrevChar == 0x094D && c == 0x200C)
276 1 : cSpecialChar = '\xE8';
277 78 : else if (m_cPrevChar == 0x094D && c == 0x200D)
278 1 : cSpecialChar = '\xE9';
279 79 : if (cSpecialChar)
280 : {
281 2 : if (pDestBufEnd - pDestBufPtr < 1)
282 : {
283 0 : nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
284 0 : break;
285 : }
286 2 : *pDestBufPtr++ = cSpecialChar;
287 2 : m_cPrevChar = 0;
288 2 : goto done;
289 : }
290 77 : switch (c)
291 : {
292 : case 0x0950:
293 1 : cSpecialChar = '\xA1';
294 1 : break;
295 : case 0x090C:
296 0 : cSpecialChar = '\xA6';
297 0 : break;
298 : case 0x0961:
299 0 : cSpecialChar = '\xA7';
300 0 : break;
301 : case 0x0960:
302 0 : cSpecialChar = '\xAA';
303 0 : break;
304 : case 0x0958:
305 0 : cSpecialChar = '\xB3';
306 0 : break;
307 : case 0x0959:
308 0 : cSpecialChar = '\xB4';
309 0 : break;
310 : case 0x095A:
311 0 : cSpecialChar = '\xB5';
312 0 : break;
313 : case 0x095B:
314 0 : cSpecialChar = '\xBA';
315 0 : break;
316 : case 0x095C:
317 0 : cSpecialChar = '\xBF';
318 0 : break;
319 : case 0x095D:
320 0 : cSpecialChar = '\xC0';
321 0 : break;
322 : case 0x095E:
323 0 : cSpecialChar = '\xC9';
324 0 : break;
325 : case 0x0962:
326 0 : cSpecialChar = '\xDB';
327 0 : break;
328 : case 0x0963:
329 0 : cSpecialChar = '\xDC';
330 0 : break;
331 : case 0x0944:
332 0 : cSpecialChar = '\xDF';
333 0 : break;
334 : case 0x093D:
335 1 : cSpecialChar = '\xEA';
336 1 : break;
337 : default:
338 75 : break;
339 : }
340 77 : if (cSpecialChar)
341 : {
342 2 : if (pDestBufEnd - pDestBufPtr < 2)
343 : {
344 0 : nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
345 0 : break;
346 : }
347 2 : *pDestBufPtr++ = cSpecialChar;
348 2 : *pDestBufPtr++ = '\xE9';
349 2 : m_cPrevChar = 0;
350 2 : goto done;
351 : }
352 :
353 : // Linearly searching through the ranges if probably fastest, assuming
354 : // that most converted characters belong to the ASCII subset:
355 617 : for (size_t i = 0; i < entries; ++i)
356 : {
357 617 : if (c < ranges[i].unicode)
358 : {
359 0 : break;
360 : }
361 617 : else if (c <= sal::static_int_cast< sal_uInt32 >(
362 617 : ranges[i].unicode + ranges[i].range))
363 : {
364 75 : if (pDestBufEnd - pDestBufPtr < 1)
365 : {
366 0 : goto no_output;
367 : }
368 : *pDestBufPtr++ = static_cast< sal_Char >(
369 75 : ranges[i].byte + (c - ranges[i].unicode));
370 75 : m_cPrevChar = c;
371 75 : goto done;
372 : }
373 : }
374 0 : goto bad_input;
375 : done:
376 79 : cHighSurrogate = 0;
377 79 : continue;
378 : bad_input:
379 0 : switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
380 : bUndefined, c, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo, 0,
381 0 : 0, 0))
382 : {
383 : case sal::detail::textenc::BAD_INPUT_STOP:
384 0 : cHighSurrogate = 0;
385 0 : break;
386 :
387 : case sal::detail::textenc::BAD_INPUT_CONTINUE:
388 0 : cHighSurrogate = 0;
389 0 : continue;
390 :
391 : case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
392 0 : goto no_output;
393 : }
394 0 : break;
395 : no_output:
396 0 : --pSrcBuf;
397 0 : nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
398 0 : break;
399 : }
400 :
401 2 : if (cHighSurrogate != 0
402 0 : && ((nInfo
403 0 : & (RTL_UNICODETOTEXT_INFO_ERROR
404 : | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
405 : == 0))
406 : {
407 0 : if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
408 : {
409 0 : nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
410 : }
411 : else
412 : {
413 0 : switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
414 : false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo, 0,
415 0 : 0, 0))
416 : {
417 : case sal::detail::textenc::BAD_INPUT_STOP:
418 : case sal::detail::textenc::BAD_INPUT_CONTINUE:
419 0 : cHighSurrogate = 0;
420 0 : break;
421 :
422 : case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
423 0 : nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
424 0 : break;
425 : }
426 : }
427 : }
428 2 : m_cHighSurrogate = cHighSurrogate;
429 2 : if (pInfo)
430 2 : *pInfo = nInfo;
431 2 : if (pSrcCvtChars)
432 2 : *pSrcCvtChars = nConverted;
433 :
434 2 : return pDestBufPtr - pDestBuf;
435 : }
436 :
437 83 : sal_Size ImplConvertIsciiDevanagariToUnicode(void const*,
438 : void* pContext, char const* pSrcBuf, sal_Size nSrcBytes,
439 : sal_Unicode* pDestBuf, sal_Size nDestChars, sal_uInt32 nFlags,
440 : sal_uInt32* pInfo, sal_Size* pSrcCvtBytes)
441 : {
442 : IsciiDevanagariToUnicode *pCtx =
443 83 : static_cast<IsciiDevanagariToUnicode*>(pContext);
444 : return pCtx->convert(pSrcBuf, nSrcBytes, pDestBuf, nDestChars, nFlags,
445 83 : pInfo, pSrcCvtBytes);
446 : }
447 :
448 2 : sal_Size ImplConvertUnicodeToIsciiDevanagari(void const*,
449 : void * pContext, sal_Unicode const * pSrcBuf, sal_Size nSrcChars,
450 : char * pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
451 : sal_uInt32 * pInfo, sal_Size * pSrcCvtChars)
452 : {
453 : UnicodeToIsciiDevanagari *pCtx =
454 2 : static_cast<UnicodeToIsciiDevanagari*>(pContext);
455 : return pCtx->convert(pSrcBuf, nSrcChars,
456 2 : pDestBuf, nDestBytes, nFlags, pInfo, pSrcCvtChars);
457 : }
458 :
459 4 : void *ImplCreateIsciiDevanagariToUnicodeContext()
460 : {
461 4 : return new IsciiDevanagariToUnicode;
462 : }
463 :
464 4 : void ImplDestroyIsciiDevanagariToUnicodeContext(void * pContext)
465 : {
466 : IsciiDevanagariToUnicode *pCtx =
467 4 : static_cast<IsciiDevanagariToUnicode*>(pContext);
468 4 : delete pCtx;
469 4 : }
470 :
471 0 : void ImplResetIsciiDevanagariToUnicodeContext(void * pContext)
472 : {
473 : IsciiDevanagariToUnicode *pCtx =
474 0 : static_cast<IsciiDevanagariToUnicode*>(pContext);
475 0 : pCtx->reset();
476 0 : }
477 :
478 2 : void *ImplCreateUnicodeToIsciiDevanagariContext()
479 : {
480 2 : return new UnicodeToIsciiDevanagari;
481 : }
482 :
483 0 : void ImplResetUnicodeToIsciiDevanagariContext(void * pContext)
484 : {
485 : UnicodeToIsciiDevanagari *pCtx =
486 0 : static_cast<UnicodeToIsciiDevanagari*>(pContext);
487 0 : pCtx->reset();
488 0 : }
489 :
490 2 : void ImplDestroyUnicodeToIsciiDevanagariContext(void * pContext)
491 : {
492 : UnicodeToIsciiDevanagari *pCtx =
493 2 : static_cast<UnicodeToIsciiDevanagari*>(pContext);
494 2 : delete pCtx;
495 2 : }
496 :
497 : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|