Line data Source code
1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /*
3 : * This file is part of the LibreOffice project.
4 : *
5 : * This Source Code Form is subject to the terms of the Mozilla Public
6 : * License, v. 2.0. If a copy of the MPL was not distributed with this
7 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 : */
9 :
10 : #include "converter.hxx"
11 : #include "convertisciidevangari.hxx"
12 : #include "convertsinglebytetobmpunicode.hxx"
13 : #include <rtl/textcvt.h>
14 :
15 : using namespace sal::detail::textenc;
16 : using namespace rtl::textenc;
17 :
18 : struct IsciiDevanagariToUnicode
19 : {
20 : sal_uInt8 m_cPrevChar;
21 8 : IsciiDevanagariToUnicode()
22 8 : : m_cPrevChar(0)
23 : {
24 8 : }
25 0 : void reset()
26 : {
27 0 : m_cPrevChar = 0;
28 0 : }
29 : sal_Size convert(char const* pSrcBuf, sal_Size nSrcBytes,
30 : sal_Unicode* pDestBuf, sal_Size nDestChars, sal_uInt32 nFlags,
31 : sal_uInt32* pInfo, sal_Size* pSrcCvtBytes);
32 : };
33 :
34 : struct UnicodeToIsciiDevanagari
35 : {
36 : sal_Unicode m_cPrevChar;
37 : sal_Unicode m_cHighSurrogate;
38 4 : UnicodeToIsciiDevanagari()
39 : : m_cPrevChar(0)
40 4 : , m_cHighSurrogate(0)
41 : {
42 4 : }
43 0 : void reset()
44 : {
45 0 : m_cPrevChar = 0;
46 0 : m_cHighSurrogate = 0;
47 0 : }
48 : sal_Size convert(sal_Unicode const* pSrcBuf, sal_Size nSrcChars,
49 : char* pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
50 : sal_uInt32 * pInfo, sal_Size * pSrcCvtChars);
51 : };
52 :
// Single-byte lookup table for the ISCII Devanagari -> Unicode direction:
// the index is the source byte, the value is the UTF-16 code unit it maps to.
// An entry of 0xFFFF means "no mapping"; the converter treats such bytes as
// undefined input. Two-byte sequences (a byte followed by 0xE9, or the
// 0xE8 0xE8 / 0xE8 0xE9 pairs) are handled in code, not in this table.
53 : static const sal_Unicode IsciiDevanagariMap[256] =
54 : {
// 0x00-0x7F: each byte maps to the code unit with the same value.
55 : 0x0000,0x0001,0x0002,0x0003,0x0004,0x0005,0x0006,0x0007,
56 : 0x0008,0x0009,0x000A,0x000B,0x000C,0x000D,0x000E,0x000F,
57 : 0x0010,0x0011,0x0012,0x0013,0x0014,0x0015,0x0016,0x0017,
58 : 0x0018,0x0019,0x001A,0x001B,0x001C,0x001D,0x001E,0x001F,
59 : 0x0020,0x0021,0x0022,0x0023,0x0024,0x0025,0x0026,0x0027,
60 : 0x0028,0x0029,0x002A,0x002B,0x002C,0x002D,0x002E,0x002F,
61 : 0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,
62 : 0x0038,0x0039,0x003A,0x003B,0x003C,0x003D,0x003E,0x003F,
63 : 0x0040,0x0041,0x0042,0x0043,0x0044,0x0045,0x0046,0x0047,
64 : 0x0048,0x0049,0x004A,0x004B,0x004C,0x004D,0x004E,0x004F,
65 : 0x0050,0x0051,0x0052,0x0053,0x0054,0x0055,0x0056,0x0057,
66 : 0x0058,0x0059,0x005A,0x005B,0x005C,0x005D,0x005E,0x005F,
67 : 0x0060,0x0061,0x0062,0x0063,0x0064,0x0065,0x0066,0x0067,
68 : 0x0068,0x0069,0x006A,0x006B,0x006C,0x006D,0x006E,0x006F,
69 : 0x0070,0x0071,0x0072,0x0073,0x0074,0x0075,0x0076,0x0077,
70 : 0x0078,0x0079,0x007A,0x007B,0x007C,0x007D,0x007E,0x007F,
// 0x80-0xA0: no mapping.
71 : 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
72 : 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
73 : 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
74 : 0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
// 0xA1-0xFA: code units in the U+0901..U+096F range; 0xD9, 0xEB-0xF0 and
// 0xFB-0xFF have no mapping. 0xE8 -> U+094D (halant) and 0xE9 -> U+093C
// (nukta) are the bytes the converter also gives special pair handling.
75 : 0xFFFF,0x0901,0x0902,0x0903,0x0905,0x0906,0x0907,0x0908,
76 : 0x0909,0x090A,0x090B,0x090E,0x090F,0x0910,0x090D,0x0912,
77 : 0x0913,0x0914,0x0911,0x0915,0x0916,0x0917,0x0918,0x0919,
78 : 0x091A,0x091B,0x091C,0x091D,0x091E,0x091F,0x0920,0x0921,
79 : 0x0922,0x0923,0x0924,0x0925,0x0926,0x0927,0x0928,0x0929,
80 : 0x092A,0x092B,0x092C,0x092D,0x092E,0x092F,0x095F,0x0930,
81 : 0x0931,0x0932,0x0933,0x0934,0x0935,0x0936,0x0937,0x0938,
82 : 0x0939,0xFFFF,0x093E,0x093F,0x0940,0x0941,0x0942,0x0943,
83 : 0x0946,0x0947,0x0948,0x0945,0x094A,0x094B,0x094C,0x0949,
84 : 0x094D,0x093C,0x0964,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF,
85 : 0xFFFF,0x0966,0x0967,0x0968,0x0969,0x096A,0x096B,0x096C,
86 : 0x096D,0x096E,0x096F,0xFFFF,0xFFFF,0xFFFF,0xFFFF,0xFFFF
87 : };
88 :
89 166 : sal_Size IsciiDevanagariToUnicode::convert(
90 : char const* pSrcBuf, sal_Size nSrcBytes,
91 : sal_Unicode* pDestBuf, sal_Size nDestChars, sal_uInt32 nFlags,
92 : sal_uInt32* pInfo, sal_Size* pSrcCvtBytes)
93 : {
94 166 : sal_uInt32 nInfo = 0;
95 166 : sal_Size nConverted = 0;
96 166 : sal_Unicode* pDestBufPtr = pDestBuf;
97 166 : sal_Unicode* pDestBufEnd = pDestBuf + nDestChars;
98 :
99 648 : while (nConverted < nSrcBytes)
100 : {
101 316 : if (pDestBufPtr == pDestBufEnd)
102 : {
103 0 : nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
104 0 : break;
105 : }
106 :
107 316 : sal_Unicode cChar = sal_Unicode();
108 316 : sal_uInt8 nIn = static_cast<sal_uInt8>(pSrcBuf[nConverted]);
109 316 : sal_uInt8 nNext = nConverted < nSrcBytes + 1 ? static_cast<sal_uInt8>(pSrcBuf[nConverted+1]) : 0;
110 316 : bool bNormal = true;
111 316 : bool bDouble = false;
112 : //halant + halant E8 E8 -> halant + ZWNJ 094D 200C
113 : //halant + nukta E8 E9 halant + ZWJ 094D 200D
114 316 : if (m_cPrevChar == 0xE8 && nIn == 0xE8)
115 : {
116 4 : cChar = 0x200C;
117 4 : bNormal = false;
118 : }
119 312 : else if (m_cPrevChar == 0xE8 && nIn == 0xE9)
120 : {
121 4 : cChar = 0x200D;
122 4 : bNormal = false;
123 : }
124 308 : else if (nNext == 0xE9)
125 : {
126 12 : bNormal = false;
127 12 : bDouble = true;
128 12 : switch(nIn)
129 : {
130 : case 0xA1:
131 4 : cChar = 0x0950;
132 4 : break;
133 : case 0xA6:
134 0 : cChar = 0x090C;
135 0 : break;
136 : case 0xA7:
137 0 : cChar = 0x0961;
138 0 : break;
139 : case 0xAA:
140 0 : cChar = 0x0960;
141 0 : break;
142 : case 0xB3:
143 0 : cChar = 0x0958;
144 0 : break;
145 : case 0xB4:
146 0 : cChar = 0x0959;
147 0 : break;
148 : case 0xB5:
149 0 : cChar = 0x095A;
150 0 : break;
151 : case 0xBA:
152 0 : cChar = 0x095B;
153 0 : break;
154 : case 0xBF:
155 0 : cChar = 0x095C;
156 0 : break;
157 : case 0xC0:
158 0 : cChar = 0x095D;
159 0 : break;
160 : case 0xC9:
161 0 : cChar = 0x095E;
162 0 : break;
163 : case 0xDB:
164 0 : cChar = 0x0962;
165 0 : break;
166 : case 0xDC:
167 0 : cChar = 0x0963;
168 0 : break;
169 : case 0xDF:
170 0 : cChar = 0x0944;
171 0 : break;
172 : case 0xEA:
173 4 : cChar = 0x093D;
174 4 : break;
175 : default:
176 4 : bNormal = true;
177 4 : bDouble = false;
178 4 : break;
179 : }
180 : }
181 :
182 316 : if (bNormal)
183 300 : cChar = IsciiDevanagariMap[nIn];
184 :
185 316 : bool bUndefined = cChar == 0xffff;
186 :
187 316 : if (bUndefined)
188 : {
189 : BadInputConversionAction eAction = handleBadInputTextToUnicodeConversion(
190 : bUndefined, true, 0, nFlags, &pDestBufPtr, pDestBufEnd,
191 0 : &nInfo);
192 0 : if (eAction == BAD_INPUT_CONTINUE)
193 0 : continue;
194 0 : if (eAction == BAD_INPUT_STOP)
195 0 : break;
196 0 : else if (eAction == BAD_INPUT_NO_OUTPUT)
197 : {
198 0 : nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
199 0 : break;
200 : }
201 : }
202 316 : ++nConverted;
203 316 : if (bDouble)
204 8 : ++nConverted;
205 :
206 316 : *pDestBufPtr++ = cChar;
207 316 : m_cPrevChar = bNormal ? nIn : 0;
208 : }
209 :
210 166 : if (pInfo)
211 166 : *pInfo = nInfo;
212 166 : if (pSrcCvtBytes)
213 166 : *pSrcCvtBytes = nConverted;
214 :
215 166 : return pDestBufPtr - pDestBuf;
216 : }
217 :
// Range table for the Unicode -> ISCII Devanagari direction. Each entry is
// { first code point, (last - first) of the range, byte for the first code
// point }; a code point c inside a range is encoded as byte + (c - first).
// Entries are listed in ascending code-point order, and the lookup loop in
// UnicodeToIsciiDevanagari::convert relies on that: it stops searching as
// soon as it reaches an entry whose first code point is greater than c.
// Characters that need a two-byte ISCII sequence are not listed here; the
// converter handles them separately before consulting this table.
218 : BmpUnicodeToSingleByteRange const unicodeToISCIIEncoding[] =
219 : {
220 : { 0x0000, 0x007F - 0x0000, 0x00 }, { 0x0901, 0x0903 - 0x0901, 0xA1 },
221 : { 0x0905, 0x090B - 0x0905, 0xA4 }, { 0x090D, 0x090D - 0x090D, 0xAE },
222 : { 0x090E, 0x0910 - 0x090E, 0xAB }, { 0x0911, 0x0911 - 0x0911, 0xB2 },
223 : { 0x0912, 0x0914 - 0x0912, 0xAF }, { 0x0915, 0x092F - 0x0915, 0xB3 },
224 : { 0x0930, 0x0939 - 0x0930, 0xCF }, { 0x093C, 0x093C - 0x093C, 0xE9 },
225 : { 0x093E, 0x0943 - 0x093E, 0xDA }, { 0x0945, 0x0945 - 0x0945, 0xE3 },
226 : { 0x0946, 0x0948 - 0x0946, 0xE0 }, { 0x0949, 0x0949 - 0x0949, 0xE7 },
227 : { 0x094A, 0x094C - 0x094A, 0xE4 }, { 0x094D, 0x094D - 0x094D, 0xE8 },
228 : { 0x095F, 0x095F - 0x095F, 0xCE }, { 0x0964, 0x0964 - 0x0964, 0xEA },
229 : { 0x0966, 0x096F - 0x0966, 0xF1 }
230 : };
231 :
// Converts UTF-16 code units to ISCII Devanagari bytes.
//
// pSrcBuf / nSrcChars   input code units and their count
// pDestBuf / nDestBytes output buffer and its capacity in bytes
// nFlags                passed to handleBadInputUnicodeToTextConversion; the
//                       RTL_UNICODETOTEXT_FLAGS_FLUSH bit is also tested
//                       after the loop when a high surrogate is pending
// pInfo                 if non-null, receives the RTL_UNICODETOTEXT_INFO_*
//                       bits accumulated during this call
// pSrcCvtChars          if non-null, receives the number of source code
//                       units consumed
//
// Returns the number of bytes written to pDestBuf.
//
// State carried between calls: m_cHighSurrogate (a high surrogate seen at
// the end of the previous input) and m_cPrevChar (the last character that
// was emitted through the range table; 0 after a special sequence).
232 4 : sal_Size UnicodeToIsciiDevanagari::convert(sal_Unicode const* pSrcBuf, sal_Size nSrcChars,
233 : char* pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
234 : sal_uInt32 * pInfo, sal_Size* pSrcCvtChars)
235 : {
236 4 : size_t entries = SAL_N_ELEMENTS(unicodeToISCIIEncoding);
237 4 : BmpUnicodeToSingleByteRange const * ranges = unicodeToISCIIEncoding;
238 :
// Work on a local copy of the pending surrogate; it is written back to
// m_cHighSurrogate once, after the loop.
239 4 : sal_Unicode cHighSurrogate = m_cHighSurrogate;
240 4 : sal_uInt32 nInfo = 0;
241 4 : sal_Size nConverted = 0;
242 4 : sal_Char* pDestBufPtr = pDestBuf;
243 4 : sal_Char* pDestBufEnd = pDestBuf + nDestBytes;
// Every `continue` below passes through ++nConverted (the unit counts as
// consumed); every `break` leaves nConverted at the unit being processed.
244 162 : for (; nConverted < nSrcChars; ++nConverted)
245 : {
// First argument for the bad-input handler; set to false below for
// malformed surrogate sequences and noncharacters, left true when the
// character simply has no ISCII mapping.
246 158 : bool bUndefined = true;
247 158 : sal_uInt32 c = *pSrcBuf++;
248 158 : sal_Char cSpecialChar = 0;
249 158 : if (cHighSurrogate == 0)
250 : {
251 158 : if (ImplIsHighSurrogate(c))
252 : {
// Remember the high half and wait for the next code unit.
253 0 : cHighSurrogate = static_cast< sal_Unicode >(c);
254 0 : continue;
255 : }
256 : }
257 0 : else if (ImplIsLowSurrogate(c))
258 : {
259 0 : c = ImplCombineSurrogates(cHighSurrogate, c);
260 : }
261 : else
262 : {
// A high surrogate was pending but this unit is not a low surrogate.
263 0 : bUndefined = false;
264 0 : goto bad_input;
265 : }
// A low surrogate reaching this point had no preceding high surrogate.
266 158 : if (ImplIsLowSurrogate(c) || ImplIsNoncharacter(c))
267 : {
268 0 : bUndefined = false;
269 0 : goto bad_input;
270 : }
271 :
272 : //halant + halant E8 E8 -> halant + ZWNJ 094D 200C
273 : //halant + nukta E8 E9 halant + ZWJ 094D 200D
// The halant itself was already written as 0xE8 by the range lookup on
// the previous step; only the second byte of the pair is produced here.
274 158 : if (m_cPrevChar == 0x094D && c == 0x200C)
275 2 : cSpecialChar = '\xE8';
276 156 : else if (m_cPrevChar == 0x094D && c == 0x200D)
277 2 : cSpecialChar = '\xE9';
278 158 : if (cSpecialChar)
279 : {
280 4 : if (pDestBufEnd - pDestBufPtr < 1)
281 : {
282 0 : nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
283 0 : break;
284 : }
285 4 : *pDestBufPtr++ = cSpecialChar;
286 4 : m_cPrevChar = 0;
287 4 : goto done;
288 : }
// Characters that are written as a base byte followed by 0xE9; the case
// only selects the base byte.
289 154 : switch (c)
290 : {
291 : case 0x0950:
292 2 : cSpecialChar = '\xA1';
293 2 : break;
294 : case 0x090C:
295 0 : cSpecialChar = '\xA6';
296 0 : break;
297 : case 0x0961:
298 0 : cSpecialChar = '\xA7';
299 0 : break;
300 : case 0x0960:
301 0 : cSpecialChar = '\xAA';
302 0 : break;
303 : case 0x0958:
304 0 : cSpecialChar = '\xB3';
305 0 : break;
306 : case 0x0959:
307 0 : cSpecialChar = '\xB4';
308 0 : break;
309 : case 0x095A:
310 0 : cSpecialChar = '\xB5';
311 0 : break;
312 : case 0x095B:
313 0 : cSpecialChar = '\xBA';
314 0 : break;
315 : case 0x095C:
316 0 : cSpecialChar = '\xBF';
317 0 : break;
318 : case 0x095D:
319 0 : cSpecialChar = '\xC0';
320 0 : break;
321 : case 0x095E:
322 0 : cSpecialChar = '\xC9';
323 0 : break;
324 : case 0x0962:
325 0 : cSpecialChar = '\xDB';
326 0 : break;
327 : case 0x0963:
328 0 : cSpecialChar = '\xDC';
329 0 : break;
330 : case 0x0944:
331 0 : cSpecialChar = '\xDF';
332 0 : break;
333 : case 0x093D:
334 2 : cSpecialChar = '\xEA';
335 2 : break;
336 : default:
337 150 : break;
338 : }
339 154 : if (cSpecialChar)
340 : {
// Both bytes must fit; nothing is written if only one would.
341 4 : if (pDestBufEnd - pDestBufPtr < 2)
342 : {
343 0 : nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
344 0 : break;
345 : }
346 4 : *pDestBufPtr++ = cSpecialChar;
347 4 : *pDestBufPtr++ = '\xE9';
348 4 : m_cPrevChar = 0;
349 4 : goto done;
350 : }
351 :
352 : // Linearly searching through the ranges is probably fastest, assuming
353 : // that most converted characters belong to the ASCII subset:
354 1234 : for (size_t i = 0; i < entries; ++i)
355 : {
// The table is in ascending order, so once an entry starts above c no
// later entry can contain it.
356 1234 : if (c < ranges[i].unicode)
357 : {
358 0 : break;
359 : }
360 1234 : else if (c <= sal::static_int_cast< sal_uInt32 >(
361 1234 : ranges[i].unicode + ranges[i].range))
362 : {
363 150 : if (pDestBufEnd - pDestBufPtr < 1)
364 : {
365 0 : goto no_output;
366 : }
367 : *pDestBufPtr++ = static_cast< sal_Char >(
368 150 : ranges[i].byte + (c - ranges[i].unicode));
// Remembered so that a following ZWNJ/ZWJ can be recognised after halant.
369 150 : m_cPrevChar = c;
370 150 : goto done;
371 : }
372 : }
// No range matched: c has no ISCII mapping (bUndefined is still true).
373 0 : goto bad_input;
// Reached after every successfully written character.
374 : done:
375 158 : cHighSurrogate = 0;
376 158 : continue;
377 : bad_input:
378 0 : switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
379 : bUndefined, c, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo, 0,
380 0 : 0, 0))
381 : {
382 : case sal::detail::textenc::BAD_INPUT_STOP:
383 0 : cHighSurrogate = 0;
// Leaves the switch only; the `break` after the switch ends the loop.
384 0 : break;
385 :
386 : case sal::detail::textenc::BAD_INPUT_CONTINUE:
387 0 : cHighSurrogate = 0;
// Continues the enclosing for loop, i.e. moves on to the next code unit.
388 0 : continue;
389 :
390 : case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
391 0 : goto no_output;
392 : }
393 0 : break;
// Not enough room in pDestBuf for the current character.
394 : no_output:
// pSrcBuf is a local pointer that is not read again after the loop ends.
395 0 : --pSrcBuf;
396 0 : nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
397 0 : break;
398 : }
399 :
// A high surrogate is still pending and neither an error nor a full
// destination buffer has been reported so far.
400 4 : if (cHighSurrogate != 0
401 0 : && ((nInfo
402 0 : & (RTL_UNICODETOTEXT_INFO_ERROR
403 : | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
404 : == 0))
405 : {
// With the FLUSH flag set the pending surrogate is reported as
// SRCBUFFERTOSMALL; without it the bad-input handler is consulted.
406 0 : if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
407 : {
408 0 : nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
409 : }
410 : else
411 : {
412 0 : switch (sal::detail::textenc::handleBadInputUnicodeToTextConversion(
413 : false, 0, nFlags, &pDestBufPtr, pDestBufEnd, &nInfo, 0,
414 0 : 0, 0))
415 : {
416 : case sal::detail::textenc::BAD_INPUT_STOP:
417 : case sal::detail::textenc::BAD_INPUT_CONTINUE:
418 0 : cHighSurrogate = 0;
419 0 : break;
420 :
421 : case sal::detail::textenc::BAD_INPUT_NO_OUTPUT:
422 0 : nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
423 0 : break;
424 : }
425 : }
426 : }
// Persist the (possibly still pending) high surrogate for the next call.
427 4 : m_cHighSurrogate = cHighSurrogate;
428 4 : if (pInfo)
429 4 : *pInfo = nInfo;
430 4 : if (pSrcCvtChars)
431 4 : *pSrcCvtChars = nConverted;
432 :
433 4 : return pDestBufPtr - pDestBuf;
434 : }
435 :
436 166 : sal_Size ImplConvertIsciiDevanagariToUnicode(void const*,
437 : void* pContext, char const* pSrcBuf, sal_Size nSrcBytes,
438 : sal_Unicode* pDestBuf, sal_Size nDestChars, sal_uInt32 nFlags,
439 : sal_uInt32* pInfo, sal_Size* pSrcCvtBytes)
440 : {
441 : IsciiDevanagariToUnicode *pCtx =
442 166 : static_cast<IsciiDevanagariToUnicode*>(pContext);
443 : return pCtx->convert(pSrcBuf, nSrcBytes, pDestBuf, nDestChars, nFlags,
444 166 : pInfo, pSrcCvtBytes);
445 : }
446 :
447 4 : sal_Size ImplConvertUnicodeToIsciiDevanagari(void const*,
448 : void * pContext, sal_Unicode const * pSrcBuf, sal_Size nSrcChars,
449 : char * pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags,
450 : sal_uInt32 * pInfo, sal_Size * pSrcCvtChars)
451 : {
452 : UnicodeToIsciiDevanagari *pCtx =
453 4 : static_cast<UnicodeToIsciiDevanagari*>(pContext);
454 : return pCtx->convert(pSrcBuf, nSrcChars,
455 4 : pDestBuf, nDestBytes, nFlags, pInfo, pSrcCvtChars);
456 : }
457 :
458 8 : void *ImplCreateIsciiDevanagariToUnicodeContext()
459 : {
460 8 : return new IsciiDevanagariToUnicode;
461 : }
462 :
463 8 : void ImplDestroyIsciiDevanagariToUnicodeContext(void * pContext)
464 : {
465 : IsciiDevanagariToUnicode *pCtx =
466 8 : static_cast<IsciiDevanagariToUnicode*>(pContext);
467 8 : delete pCtx;
468 8 : }
469 :
470 0 : void ImplResetIsciiDevanagariToUnicodeContext(void * pContext)
471 : {
472 : IsciiDevanagariToUnicode *pCtx =
473 0 : static_cast<IsciiDevanagariToUnicode*>(pContext);
474 0 : pCtx->reset();
475 0 : }
476 :
477 4 : void *ImplCreateUnicodeToIsciiDevanagariContext()
478 : {
479 4 : return new UnicodeToIsciiDevanagari;
480 : }
481 :
482 0 : void ImplResetUnicodeToIsciiDevanagariContext(void * pContext)
483 : {
484 : UnicodeToIsciiDevanagari *pCtx =
485 0 : static_cast<UnicodeToIsciiDevanagari*>(pContext);
486 0 : pCtx->reset();
487 0 : }
488 :
489 4 : void ImplDestroyUnicodeToIsciiDevanagariContext(void * pContext)
490 : {
491 : UnicodeToIsciiDevanagari *pCtx =
492 4 : static_cast<UnicodeToIsciiDevanagari*>(pContext);
493 4 : delete pCtx;
494 4 : }
495 :
496 : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|