Line data Source code
1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /*
3 : * This file is part of the LibreOffice project.
4 : *
5 : * This Source Code Form is subject to the terms of the Mozilla Public
6 : * License, v. 2.0. If a copy of the MPL was not distributed with this
7 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 : *
9 : * This file incorporates work covered by the following license notice:
10 : *
11 : * Licensed to the Apache Software Foundation (ASF) under one or more
12 : * contributor license agreements. See the NOTICE file distributed
13 : * with this work for additional information regarding copyright
14 : * ownership. The ASF licenses this file to you under the Apache
15 : * License, Version 2.0 (the "License"); you may not use this file
16 : * except in compliance with the License. You may obtain a copy of
17 : * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 : */
19 :
20 : #include <boost/scoped_ptr.hpp>
21 : #include <com/sun/star/i18n/UnicodeType.hpp>
22 : #include <com/sun/star/i18n/KCharacterType.hpp>
23 : #include <com/sun/star/i18n/ScriptType.hpp>
24 : #include <i18nlangtag/languagetag.hxx>
25 : #include <i18nlangtag/languagetagicu.hxx>
26 : #include <i18nutil/unicode.hxx>
27 : #include <sal/log.hxx>
28 : #include <unicode/numfmt.h>
29 : #include "unicode_data.h"
30 :
31 : // Workaround for glibc braindamage:
32 : // glibc 2.4's langinfo.h does "#define CURRENCY_SYMBOL __CURRENCY_SYMBOL"
33 : // which (obviously) breaks UnicodeType::CURRENCY_SYMBOL
34 : #undef CURRENCY_SYMBOL
35 :
36 : using namespace ::com::sun::star::i18n;
37 :
38 : static const ScriptTypeList defaultTypeList[] = {
39 : { UnicodeScript_kBasicLatin,
40 : UnicodeScript_kBasicLatin,
41 : UnicodeScript_kBasicLatin }, // 0,
42 : { UnicodeScript_kLatin1Supplement,
43 : UnicodeScript_kLatin1Supplement,
44 : UnicodeScript_kLatin1Supplement },// 1,
45 : { UnicodeScript_kLatinExtendedA,
46 : UnicodeScript_kLatinExtendedA,
47 : UnicodeScript_kLatinExtendedA }, // 2,
48 : { UnicodeScript_kLatinExtendedB,
49 : UnicodeScript_kLatinExtendedB,
50 : UnicodeScript_kLatinExtendedB }, // 3,
51 : { UnicodeScript_kIPAExtension,
52 : UnicodeScript_kIPAExtension,
53 : UnicodeScript_kIPAExtension }, // 4,
54 : { UnicodeScript_kSpacingModifier,
55 : UnicodeScript_kSpacingModifier,
56 : UnicodeScript_kSpacingModifier }, // 5,
57 : { UnicodeScript_kCombiningDiacritical,
58 : UnicodeScript_kCombiningDiacritical,
59 : UnicodeScript_kCombiningDiacritical }, // 6,
60 : { UnicodeScript_kGreek,
61 : UnicodeScript_kGreek,
62 : UnicodeScript_kGreek }, // 7,
63 : { UnicodeScript_kCyrillic,
64 : UnicodeScript_kCyrillic,
65 : UnicodeScript_kCyrillic }, // 8,
66 : { UnicodeScript_kArmenian,
67 : UnicodeScript_kArmenian,
68 : UnicodeScript_kArmenian }, // 9,
69 : { UnicodeScript_kHebrew,
70 : UnicodeScript_kHebrew,
71 : UnicodeScript_kHebrew }, // 10,
72 : { UnicodeScript_kArabic,
73 : UnicodeScript_kArabic,
74 : UnicodeScript_kArabic }, // 11,
75 : { UnicodeScript_kSyriac,
76 : UnicodeScript_kSyriac,
77 : UnicodeScript_kSyriac }, // 12,
78 : { UnicodeScript_kThaana,
79 : UnicodeScript_kThaana,
80 : UnicodeScript_kThaana }, // 13,
81 : { UnicodeScript_kDevanagari,
82 : UnicodeScript_kDevanagari,
83 : UnicodeScript_kDevanagari }, // 14,
84 : { UnicodeScript_kBengali,
85 : UnicodeScript_kBengali,
86 : UnicodeScript_kBengali }, // 15,
87 : { UnicodeScript_kGurmukhi,
88 : UnicodeScript_kGurmukhi,
89 : UnicodeScript_kGurmukhi }, // 16,
90 : { UnicodeScript_kGujarati,
91 : UnicodeScript_kGujarati,
92 : UnicodeScript_kGujarati }, // 17,
93 : { UnicodeScript_kOriya,
94 : UnicodeScript_kOriya,
95 : UnicodeScript_kOriya }, // 18,
96 : { UnicodeScript_kTamil,
97 : UnicodeScript_kTamil,
98 : UnicodeScript_kTamil }, // 19,
99 : { UnicodeScript_kTelugu,
100 : UnicodeScript_kTelugu,
101 : UnicodeScript_kTelugu }, // 20,
102 : { UnicodeScript_kKannada,
103 : UnicodeScript_kKannada,
104 : UnicodeScript_kKannada }, // 21,
105 : { UnicodeScript_kMalayalam,
106 : UnicodeScript_kMalayalam,
107 : UnicodeScript_kMalayalam }, // 22,
108 : { UnicodeScript_kSinhala,
109 : UnicodeScript_kSinhala,
110 : UnicodeScript_kSinhala }, // 23,
111 : { UnicodeScript_kThai,
112 : UnicodeScript_kThai,
113 : UnicodeScript_kThai }, // 24,
114 : { UnicodeScript_kLao,
115 : UnicodeScript_kLao,
116 : UnicodeScript_kLao }, // 25,
117 : { UnicodeScript_kTibetan,
118 : UnicodeScript_kTibetan,
119 : UnicodeScript_kTibetan }, // 26,
120 : { UnicodeScript_kMyanmar,
121 : UnicodeScript_kMyanmar,
122 : UnicodeScript_kMyanmar }, // 27,
123 : { UnicodeScript_kGeorgian,
124 : UnicodeScript_kGeorgian,
125 : UnicodeScript_kGeorgian }, // 28,
126 : { UnicodeScript_kHangulJamo,
127 : UnicodeScript_kHangulJamo,
128 : UnicodeScript_kHangulJamo }, // 29,
129 : { UnicodeScript_kEthiopic,
130 : UnicodeScript_kEthiopic,
131 : UnicodeScript_kEthiopic }, // 30,
132 : { UnicodeScript_kCherokee,
133 : UnicodeScript_kCherokee,
134 : UnicodeScript_kCherokee }, // 31,
135 : { UnicodeScript_kUnifiedCanadianAboriginalSyllabics,
136 : UnicodeScript_kUnifiedCanadianAboriginalSyllabics,
137 : UnicodeScript_kUnifiedCanadianAboriginalSyllabics }, // 32,
138 : { UnicodeScript_kOgham,
139 : UnicodeScript_kOgham,
140 : UnicodeScript_kOgham }, // 33,
141 : { UnicodeScript_kRunic,
142 : UnicodeScript_kRunic,
143 : UnicodeScript_kRunic }, // 34,
144 : { UnicodeScript_kKhmer,
145 : UnicodeScript_kKhmer,
146 : UnicodeScript_kKhmer }, // 35,
147 : { UnicodeScript_kMongolian,
148 : UnicodeScript_kMongolian,
149 : UnicodeScript_kMongolian }, // 36,
150 : { UnicodeScript_kLatinExtendedAdditional,
151 : UnicodeScript_kLatinExtendedAdditional,
152 : UnicodeScript_kLatinExtendedAdditional }, // 37,
153 : { UnicodeScript_kGreekExtended,
154 : UnicodeScript_kGreekExtended,
155 : UnicodeScript_kGreekExtended }, // 38,
156 : { UnicodeScript_kGeneralPunctuation,
157 : UnicodeScript_kGeneralPunctuation,
158 : UnicodeScript_kGeneralPunctuation }, // 39,
159 : { UnicodeScript_kSuperSubScript,
160 : UnicodeScript_kSuperSubScript,
161 : UnicodeScript_kSuperSubScript }, // 40,
162 : { UnicodeScript_kCurrencySymbolScript,
163 : UnicodeScript_kCurrencySymbolScript,
164 : UnicodeScript_kCurrencySymbolScript }, // 41,
165 : { UnicodeScript_kSymbolCombiningMark,
166 : UnicodeScript_kSymbolCombiningMark,
167 : UnicodeScript_kSymbolCombiningMark }, // 42,
168 : { UnicodeScript_kLetterlikeSymbol,
169 : UnicodeScript_kLetterlikeSymbol,
170 : UnicodeScript_kLetterlikeSymbol }, // 43,
171 : { UnicodeScript_kNumberForm,
172 : UnicodeScript_kNumberForm,
173 : UnicodeScript_kNumberForm }, // 44,
174 : { UnicodeScript_kArrow,
175 : UnicodeScript_kArrow,
176 : UnicodeScript_kArrow }, // 45,
177 : { UnicodeScript_kMathOperator,
178 : UnicodeScript_kMathOperator,
179 : UnicodeScript_kMathOperator }, // 46,
180 : { UnicodeScript_kMiscTechnical,
181 : UnicodeScript_kMiscTechnical,
182 : UnicodeScript_kMiscTechnical }, // 47,
183 : { UnicodeScript_kControlPicture,
184 : UnicodeScript_kControlPicture,
185 : UnicodeScript_kControlPicture }, // 48,
186 : { UnicodeScript_kOpticalCharacter,
187 : UnicodeScript_kOpticalCharacter,
188 : UnicodeScript_kOpticalCharacter }, // 49,
189 : { UnicodeScript_kEnclosedAlphanumeric,
190 : UnicodeScript_kEnclosedAlphanumeric,
191 : UnicodeScript_kEnclosedAlphanumeric }, // 50,
192 : { UnicodeScript_kBoxDrawing,
193 : UnicodeScript_kBoxDrawing,
194 : UnicodeScript_kBoxDrawing }, // 51,
195 : { UnicodeScript_kBlockElement,
196 : UnicodeScript_kBlockElement,
197 : UnicodeScript_kBlockElement }, // 52,
198 : { UnicodeScript_kGeometricShape,
199 : UnicodeScript_kGeometricShape,
200 : UnicodeScript_kGeometricShape }, // 53,
201 : { UnicodeScript_kMiscSymbol,
202 : UnicodeScript_kMiscSymbol,
203 : UnicodeScript_kMiscSymbol }, // 54,
204 : { UnicodeScript_kDingbat,
205 : UnicodeScript_kDingbat,
206 : UnicodeScript_kDingbat }, // 55,
207 : { UnicodeScript_kBraillePatterns,
208 : UnicodeScript_kBraillePatterns,
209 : UnicodeScript_kBraillePatterns }, // 56,
210 : { UnicodeScript_kCJKRadicalsSupplement,
211 : UnicodeScript_kCJKRadicalsSupplement,
212 : UnicodeScript_kCJKRadicalsSupplement }, // 57,
213 : { UnicodeScript_kKangxiRadicals,
214 : UnicodeScript_kKangxiRadicals,
215 : UnicodeScript_kKangxiRadicals }, // 58,
216 : { UnicodeScript_kIdeographicDescriptionCharacters,
217 : UnicodeScript_kIdeographicDescriptionCharacters,
218 : UnicodeScript_kIdeographicDescriptionCharacters }, // 59,
219 : { UnicodeScript_kCJKSymbolPunctuation,
220 : UnicodeScript_kCJKSymbolPunctuation,
221 : UnicodeScript_kCJKSymbolPunctuation }, // 60,
222 : { UnicodeScript_kHiragana,
223 : UnicodeScript_kHiragana,
224 : UnicodeScript_kHiragana }, // 61,
225 : { UnicodeScript_kKatakana,
226 : UnicodeScript_kKatakana,
227 : UnicodeScript_kKatakana }, // 62,
228 : { UnicodeScript_kBopomofo,
229 : UnicodeScript_kBopomofo,
230 : UnicodeScript_kBopomofo }, // 63,
231 : { UnicodeScript_kHangulCompatibilityJamo,
232 : UnicodeScript_kHangulCompatibilityJamo,
233 : UnicodeScript_kHangulCompatibilityJamo }, // 64,
234 : { UnicodeScript_kKanbun,
235 : UnicodeScript_kKanbun,
236 : UnicodeScript_kKanbun }, // 65,
237 : { UnicodeScript_kBopomofoExtended,
238 : UnicodeScript_kBopomofoExtended,
239 : UnicodeScript_kBopomofoExtended }, // 66,
240 : { UnicodeScript_kEnclosedCJKLetterMonth,
241 : UnicodeScript_kEnclosedCJKLetterMonth,
242 : UnicodeScript_kEnclosedCJKLetterMonth }, // 67,
243 : { UnicodeScript_kCJKCompatibility,
244 : UnicodeScript_kCJKCompatibility,
245 : UnicodeScript_kCJKCompatibility }, // 68,
246 : { UnicodeScript_k_CJKUnifiedIdeographsExtensionA,
247 : UnicodeScript_k_CJKUnifiedIdeographsExtensionA,
248 : UnicodeScript_k_CJKUnifiedIdeographsExtensionA }, // 69,
249 : { UnicodeScript_kCJKUnifiedIdeograph,
250 : UnicodeScript_kCJKUnifiedIdeograph,
251 : UnicodeScript_kCJKUnifiedIdeograph }, // 70,
252 : { UnicodeScript_kYiSyllables,
253 : UnicodeScript_kYiSyllables,
254 : UnicodeScript_kYiSyllables }, // 71,
255 : { UnicodeScript_kYiRadicals,
256 : UnicodeScript_kYiRadicals,
257 : UnicodeScript_kYiRadicals }, // 72,
258 : { UnicodeScript_kHangulSyllable,
259 : UnicodeScript_kHangulSyllable,
260 : UnicodeScript_kHangulSyllable }, // 73,
261 : { UnicodeScript_kHighSurrogate,
262 : UnicodeScript_kHighSurrogate,
263 : UnicodeScript_kHighSurrogate }, // 74,
264 : { UnicodeScript_kHighPrivateUseSurrogate,
265 : UnicodeScript_kHighPrivateUseSurrogate,
266 : UnicodeScript_kHighPrivateUseSurrogate }, // 75,
267 : { UnicodeScript_kLowSurrogate,
268 : UnicodeScript_kLowSurrogate,
269 : UnicodeScript_kLowSurrogate }, // 76,
270 : { UnicodeScript_kPrivateUse,
271 : UnicodeScript_kPrivateUse,
272 : UnicodeScript_kPrivateUse }, // 77,
273 : { UnicodeScript_kCJKCompatibilityIdeograph,
274 : UnicodeScript_kCJKCompatibilityIdeograph,
275 : UnicodeScript_kCJKCompatibilityIdeograph }, // 78,
276 : { UnicodeScript_kAlphabeticPresentation,
277 : UnicodeScript_kAlphabeticPresentation,
278 : UnicodeScript_kAlphabeticPresentation }, // 79,
279 : { UnicodeScript_kArabicPresentationA,
280 : UnicodeScript_kArabicPresentationA,
281 : UnicodeScript_kArabicPresentationA }, // 80,
282 : { UnicodeScript_kCombiningHalfMark,
283 : UnicodeScript_kCombiningHalfMark,
284 : UnicodeScript_kCombiningHalfMark }, // 81,
285 : { UnicodeScript_kCJKCompatibilityForm,
286 : UnicodeScript_kCJKCompatibilityForm,
287 : UnicodeScript_kCJKCompatibilityForm }, // 82,
288 : { UnicodeScript_kSmallFormVariant,
289 : UnicodeScript_kSmallFormVariant,
290 : UnicodeScript_kSmallFormVariant }, // 83,
291 : { UnicodeScript_kArabicPresentationB,
292 : UnicodeScript_kArabicPresentationB,
293 : UnicodeScript_kArabicPresentationB }, // 84,
294 : { UnicodeScript_kNoScript,
295 : UnicodeScript_kNoScript,
296 : UnicodeScript_kNoScript }, // 85,
297 : { UnicodeScript_kHalfwidthFullwidthForm,
298 : UnicodeScript_kHalfwidthFullwidthForm,
299 : UnicodeScript_kHalfwidthFullwidthForm }, // 86,
300 : { UnicodeScript_kScriptCount,
301 : UnicodeScript_kScriptCount,
302 : UnicodeScript_kNoScript } // 87,
303 : };
304 :
305 : sal_Int16 SAL_CALL
306 6056 : unicode::getUnicodeScriptType( const sal_Unicode ch, const ScriptTypeList* typeList, sal_Int16 unknownType ) {
307 :
308 6056 : if (!typeList) {
309 0 : typeList = defaultTypeList;
310 0 : unknownType = UnicodeScript_kNoScript;
311 : }
312 :
313 6056 : sal_Int16 i = 0, type = typeList[0].to;
314 12959 : while (type < UnicodeScript_kScriptCount && ch > UnicodeScriptType[type][UnicodeScriptTypeTo]) {
315 847 : type = typeList[++i].to;
316 : }
317 :
318 5857 : return (type < UnicodeScript_kScriptCount &&
319 5857 : ch >= UnicodeScriptType[typeList[i].from][UnicodeScriptTypeFrom]) ?
320 6119 : typeList[i].value : unknownType;
321 : }
322 :
323 : sal_Unicode SAL_CALL
324 10 : unicode::getUnicodeScriptStart( UnicodeScript type) {
325 10 : return UnicodeScriptType[type][UnicodeScriptTypeFrom];
326 : }
327 :
328 : sal_Unicode SAL_CALL
329 10 : unicode::getUnicodeScriptEnd( UnicodeScript type) {
330 10 : return UnicodeScriptType[type][UnicodeScriptTypeTo];
331 : }
332 :
333 : sal_Int16 SAL_CALL
334 127550 : unicode::getUnicodeType( const sal_Unicode ch ) {
335 : static sal_Unicode c = 0x00;
336 : static sal_Int16 r = 0x00;
337 :
338 127550 : if (ch == c) return r;
339 25483 : else c = ch;
340 :
341 25483 : sal_Int16 address = UnicodeTypeIndex[ch >> 8];
342 0 : return r = (sal_Int16)((address < UnicodeTypeNumberBlock) ? UnicodeTypeBlockValue[address] :
343 25483 : UnicodeTypeValue[((address - UnicodeTypeNumberBlock) << 8) + (ch & 0xff)]);
344 : }
345 :
346 : sal_uInt8 SAL_CALL
347 0 : unicode::getUnicodeDirection( const sal_Unicode ch ) {
348 : static sal_Unicode c = 0x00;
349 : static sal_uInt8 r = 0x00;
350 :
351 0 : if (ch == c) return r;
352 0 : else c = ch;
353 :
354 0 : sal_Int16 address = UnicodeDirectionIndex[ch >> 8];
355 0 : return r = ((address < UnicodeDirectionNumberBlock) ? UnicodeDirectionBlockValue[address] :
356 0 : UnicodeDirectionValue[((address - UnicodeDirectionNumberBlock) << 8) + (ch & 0xff)]);
357 :
358 : }
359 :
360 : #define bit(name) (1U << name)
361 :
362 : #define UPPERMASK bit(UnicodeType::UPPERCASE_LETTER)
363 :
364 : #define LOWERMASK bit(UnicodeType::LOWERCASE_LETTER)
365 :
366 : #define TITLEMASK bit(UnicodeType::TITLECASE_LETTER)
367 :
368 : #define ALPHAMASK UPPERMASK|LOWERMASK|TITLEMASK|\
369 : bit(UnicodeType::MODIFIER_LETTER)|\
370 : bit(UnicodeType::OTHER_LETTER)
371 :
372 : #define SPACEMASK bit(UnicodeType::SPACE_SEPARATOR)|\
373 : bit(UnicodeType::LINE_SEPARATOR)|\
374 : bit(UnicodeType::PARAGRAPH_SEPARATOR)
375 :
376 : #define CONTROLMASK bit(UnicodeType::CONTROL)|\
377 : bit(UnicodeType::FORMAT)|\
378 : bit(UnicodeType::LINE_SEPARATOR)|\
379 : bit(UnicodeType::PARAGRAPH_SEPARATOR)
380 :
381 : #define IsType(func, mask) \
382 : bool SAL_CALL func( const sal_Unicode ch) {\
383 : return (bit(getUnicodeType(ch)) & (mask)) != 0;\
384 : }
385 :
386 63457 : IsType(unicode::isControl, CONTROLMASK)
387 64061 : IsType(unicode::isAlpha, ALPHAMASK)
388 20 : IsType(unicode::isSpace, SPACEMASK)
389 :
390 : #define CONTROLSPACE bit(0x09)|bit(0x0a)|bit(0x0b)|bit(0x0c)|bit(0x0d)|\
391 : bit(0x1c)|bit(0x1d)|bit(0x1e)|bit(0x1f)
392 :
393 20 : bool SAL_CALL unicode::isWhiteSpace( const sal_Unicode ch) {
394 20 : return (ch != 0xa0 && isSpace(ch)) || (ch <= 0x1F && (bit(ch) & (CONTROLSPACE)));
395 : }
396 :
397 284417 : sal_Int16 SAL_CALL unicode::getScriptClassFromUScriptCode(UScriptCode eScript)
398 : {
399 : //See unicode/uscript.h
400 : static const sal_Int16 scriptTypes[] =
401 : {
402 : ScriptType::WEAK, ScriptType::WEAK, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX,
403 : ScriptType::ASIAN, ScriptType::LATIN, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX,
404 : ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::LATIN,
405 : // 15
406 : ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::ASIAN, ScriptType::COMPLEX,
407 : ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX,
408 : ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN,
409 : // 30
410 : ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX,
411 : ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
412 : ScriptType::LATIN, ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
413 : // 45
414 : ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX,
415 : ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN,
416 : ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
417 : // 60
418 : ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
419 : ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX,
420 : ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::ASIAN,
421 : // 75
422 : ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
423 : ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
424 : ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
425 : // 90
426 : ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
427 : ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
428 : ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::WEAK, ScriptType::WEAK, ScriptType::COMPLEX,
429 : // 105
430 : ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
431 : ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
432 : ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN,
433 : // 120
434 : ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
435 : ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::WEAK, ScriptType::WEAK,
436 : ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
437 : // 135
438 : ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
439 : ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
440 : ScriptType::COMPLEX,
441 : ScriptType::WEAK
442 : };
443 :
444 : sal_Int16 nRet;
445 284417 : if (eScript < USCRIPT_COMMON)
446 0 : nRet = ScriptType::WEAK;
447 284417 : else if (static_cast<size_t>(eScript) >= SAL_N_ELEMENTS(scriptTypes))
448 0 : nRet = ScriptType::COMPLEX; // anything new is going to be pretty wild
449 : else
450 284417 : nRet = scriptTypes[eScript];
451 284417 : return nRet;
452 : }
453 :
454 14 : OString SAL_CALL unicode::getExemplarLanguageForUScriptCode(UScriptCode eScript)
455 : {
456 14 : OString sRet;
457 14 : switch (eScript)
458 : {
459 : case USCRIPT_CODE_LIMIT:
460 : case USCRIPT_INVALID_CODE:
461 0 : sRet = "zxx";
462 0 : break;
463 : case USCRIPT_COMMON:
464 : case USCRIPT_INHERITED:
465 0 : sRet = "und";
466 0 : break;
467 : case USCRIPT_MATHEMATICAL_NOTATION:
468 : case USCRIPT_SYMBOLS:
469 0 : sRet = "zxx";
470 0 : break;
471 : case USCRIPT_UNWRITTEN_LANGUAGES:
472 : case USCRIPT_UNKNOWN:
473 0 : sRet = "und";
474 0 : break;
475 : case USCRIPT_ARABIC:
476 0 : sRet = "ar";
477 0 : break;
478 : case USCRIPT_ARMENIAN:
479 0 : sRet = "hy";
480 0 : break;
481 : case USCRIPT_BENGALI:
482 0 : sRet = "bn";
483 0 : break;
484 : case USCRIPT_BOPOMOFO:
485 0 : sRet = "zh";
486 0 : break;
487 : case USCRIPT_CHEROKEE:
488 0 : sRet = "chr";
489 0 : break;
490 : case USCRIPT_COPTIC:
491 0 : sRet = "cop";
492 0 : break;
493 : case USCRIPT_CYRILLIC:
494 0 : sRet = "ru";
495 0 : break;
496 : case USCRIPT_DESERET:
497 0 : sRet = "en";
498 0 : break;
499 : case USCRIPT_DEVANAGARI:
500 0 : sRet = "hi";
501 0 : break;
502 : case USCRIPT_ETHIOPIC:
503 0 : sRet = "am";
504 0 : break;
505 : case USCRIPT_GEORGIAN:
506 0 : sRet = "ka";
507 0 : break;
508 : case USCRIPT_GOTHIC:
509 0 : sRet = "got";
510 0 : break;
511 : case USCRIPT_GREEK:
512 0 : sRet = "el";
513 0 : break;
514 : case USCRIPT_GUJARATI:
515 0 : sRet = "gu";
516 0 : break;
517 : case USCRIPT_GURMUKHI:
518 0 : sRet = "pa";
519 0 : break;
520 : case USCRIPT_HAN:
521 14 : sRet = "zh";
522 14 : break;
523 : case USCRIPT_HANGUL:
524 0 : sRet = "ko";
525 0 : break;
526 : case USCRIPT_HEBREW:
527 0 : sRet = "hr";
528 0 : break;
529 : case USCRIPT_HIRAGANA:
530 0 : sRet = "ja";
531 0 : break;
532 : case USCRIPT_KANNADA:
533 0 : sRet = "kn";
534 0 : break;
535 : case USCRIPT_KATAKANA:
536 0 : sRet = "ja";
537 0 : break;
538 : case USCRIPT_KHMER:
539 0 : sRet = "km";
540 0 : break;
541 : case USCRIPT_LAO:
542 0 : sRet = "lo";
543 0 : break;
544 : case USCRIPT_LATIN:
545 0 : sRet = "en";
546 0 : break;
547 : case USCRIPT_MALAYALAM:
548 0 : sRet = "ml";
549 0 : break;
550 : case USCRIPT_MONGOLIAN:
551 0 : sRet = "mn";
552 0 : break;
553 : case USCRIPT_MYANMAR:
554 0 : sRet = "my";
555 0 : break;
556 : case USCRIPT_OGHAM:
557 0 : sRet = "pgl";
558 0 : break;
559 : case USCRIPT_OLD_ITALIC:
560 0 : sRet = "osc";
561 0 : break;
562 : case USCRIPT_ORIYA:
563 0 : sRet = "or";
564 0 : break;
565 : case USCRIPT_RUNIC:
566 0 : sRet = "ang";
567 0 : break;
568 : case USCRIPT_SINHALA:
569 0 : sRet = "si";
570 0 : break;
571 : case USCRIPT_SYRIAC:
572 0 : sRet = "syr";
573 0 : break;
574 : case USCRIPT_TAMIL:
575 0 : sRet = "ta";
576 0 : break;
577 : case USCRIPT_TELUGU:
578 0 : sRet = "te";
579 0 : break;
580 : case USCRIPT_THAANA:
581 0 : sRet = "dv";
582 0 : break;
583 : case USCRIPT_THAI:
584 0 : sRet = "th";
585 0 : break;
586 : case USCRIPT_TIBETAN:
587 0 : sRet = "bo";
588 0 : break;
589 : case USCRIPT_CANADIAN_ABORIGINAL:
590 0 : sRet = "iu";
591 0 : break;
592 : case USCRIPT_YI:
593 0 : sRet = "ii";
594 0 : break;
595 : case USCRIPT_TAGALOG:
596 0 : sRet = "tl";
597 0 : break;
598 : case USCRIPT_HANUNOO:
599 0 : sRet = "hnn";
600 0 : break;
601 : case USCRIPT_BUHID:
602 0 : sRet = "bku";
603 0 : break;
604 : case USCRIPT_TAGBANWA:
605 0 : sRet = "tbw";
606 0 : break;
607 : case USCRIPT_BRAILLE:
608 0 : sRet = "en";
609 0 : break;
610 : case USCRIPT_CYPRIOT:
611 0 : sRet = "ecy";
612 0 : break;
613 : case USCRIPT_LIMBU:
614 0 : sRet = "lif";
615 0 : break;
616 : case USCRIPT_LINEAR_B:
617 0 : sRet = "gmy";
618 0 : break;
619 : case USCRIPT_OSMANYA:
620 0 : sRet = "so";
621 0 : break;
622 : case USCRIPT_SHAVIAN:
623 0 : sRet = "en";
624 0 : break;
625 : case USCRIPT_TAI_LE:
626 0 : sRet = "tdd";
627 0 : break;
628 : case USCRIPT_UGARITIC:
629 0 : sRet = "uga";
630 0 : break;
631 : case USCRIPT_KATAKANA_OR_HIRAGANA:
632 0 : sRet = "ja";
633 0 : break;
634 : case USCRIPT_BUGINESE:
635 0 : sRet = "bug";
636 0 : break;
637 : case USCRIPT_GLAGOLITIC:
638 0 : sRet = "ch";
639 0 : break;
640 : case USCRIPT_KHAROSHTHI:
641 0 : sRet = "pra";
642 0 : break;
643 : case USCRIPT_SYLOTI_NAGRI:
644 0 : sRet = "syl";
645 0 : break;
646 : case USCRIPT_NEW_TAI_LUE:
647 0 : sRet = "khb";
648 0 : break;
649 : case USCRIPT_TIFINAGH:
650 0 : sRet = "tmh";
651 0 : break;
652 : case USCRIPT_OLD_PERSIAN:
653 0 : sRet = "peo";
654 0 : break;
655 : case USCRIPT_BALINESE:
656 0 : sRet = "ban";
657 0 : break;
658 : case USCRIPT_BATAK:
659 0 : sRet = "btk";
660 0 : break;
661 : case USCRIPT_BLISSYMBOLS:
662 0 : sRet = "en";
663 0 : break;
664 : case USCRIPT_BRAHMI:
665 0 : sRet = "pra";
666 0 : break;
667 : case USCRIPT_CHAM:
668 0 : sRet = "cja";
669 0 : break;
670 : case USCRIPT_CIRTH:
671 0 : sRet = "sjn";
672 0 : break;
673 : case USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC:
674 0 : sRet = "cu";
675 0 : break;
676 : case USCRIPT_DEMOTIC_EGYPTIAN:
677 : case USCRIPT_HIERATIC_EGYPTIAN:
678 : case USCRIPT_EGYPTIAN_HIEROGLYPHS:
679 0 : sRet = "egy";
680 0 : break;
681 : case USCRIPT_KHUTSURI:
682 0 : sRet = "ka";
683 0 : break;
684 : case USCRIPT_SIMPLIFIED_HAN:
685 0 : sRet = "zh";
686 0 : break;
687 : case USCRIPT_TRADITIONAL_HAN:
688 0 : sRet = "zh";
689 0 : break;
690 : case USCRIPT_PAHAWH_HMONG:
691 0 : sRet = "blu";
692 0 : break;
693 : case USCRIPT_OLD_HUNGARIAN:
694 0 : sRet = "ohu";
695 0 : break;
696 : case USCRIPT_HARAPPAN_INDUS:
697 0 : sRet = "xiv";
698 0 : break;
699 : case USCRIPT_JAVANESE:
700 0 : sRet = "kaw";
701 0 : break;
702 : case USCRIPT_KAYAH_LI:
703 0 : sRet = "eky";
704 0 : break;
705 : case USCRIPT_LATIN_FRAKTUR:
706 0 : sRet = "de";
707 0 : break;
708 : case USCRIPT_LATIN_GAELIC:
709 0 : sRet = "ga";
710 0 : break;
711 : case USCRIPT_LEPCHA:
712 0 : sRet = "lep";
713 0 : break;
714 : case USCRIPT_LINEAR_A:
715 0 : sRet = "ecr";
716 0 : break;
717 : case USCRIPT_MAYAN_HIEROGLYPHS:
718 0 : sRet = "myn";
719 0 : break;
720 : case USCRIPT_MEROITIC:
721 0 : sRet = "xmr";
722 0 : break;
723 : case USCRIPT_NKO:
724 0 : sRet = "nqo";
725 0 : break;
726 : case USCRIPT_ORKHON:
727 0 : sRet = "otk";
728 0 : break;
729 : case USCRIPT_OLD_PERMIC:
730 0 : sRet = "kv";
731 0 : break;
732 : case USCRIPT_PHAGS_PA:
733 0 : sRet = "xng";
734 0 : break;
735 : case USCRIPT_PHOENICIAN:
736 0 : sRet = "phn";
737 0 : break;
738 : case USCRIPT_PHONETIC_POLLARD:
739 0 : sRet = "hmd";
740 0 : break;
741 : case USCRIPT_RONGORONGO:
742 0 : sRet = "rap";
743 0 : break;
744 : case USCRIPT_SARATI:
745 0 : sRet = "qya";
746 0 : break;
747 : case USCRIPT_ESTRANGELO_SYRIAC:
748 0 : sRet = "syr";
749 0 : break;
750 : case USCRIPT_WESTERN_SYRIAC:
751 0 : sRet = "tru";
752 0 : break;
753 : case USCRIPT_EASTERN_SYRIAC:
754 0 : sRet = "aii";
755 0 : break;
756 : case USCRIPT_TENGWAR:
757 0 : sRet = "sjn";
758 0 : break;
759 : case USCRIPT_VAI:
760 0 : sRet = "vai";
761 0 : break;
762 : case USCRIPT_VISIBLE_SPEECH:
763 0 : sRet = "en";
764 0 : break;
765 : case USCRIPT_CUNEIFORM:
766 0 : sRet = "akk";
767 0 : break;
768 : case USCRIPT_CARIAN:
769 0 : sRet = "xcr";
770 0 : break;
771 : case USCRIPT_JAPANESE:
772 0 : sRet = "ja";
773 0 : break;
774 : case USCRIPT_LANNA:
775 0 : sRet = "nod";
776 0 : break;
777 : case USCRIPT_LYCIAN:
778 0 : sRet = "xlc";
779 0 : break;
780 : case USCRIPT_LYDIAN:
781 0 : sRet = "xld";
782 0 : break;
783 : case USCRIPT_OL_CHIKI:
784 0 : sRet = "sat";
785 0 : break;
786 : case USCRIPT_REJANG:
787 0 : sRet = "rej";
788 0 : break;
789 : case USCRIPT_SAURASHTRA:
790 0 : sRet = "saz";
791 0 : break;
792 : case USCRIPT_SIGN_WRITING:
793 0 : sRet = "en";
794 0 : break;
795 : case USCRIPT_SUNDANESE:
796 0 : sRet = "su";
797 0 : break;
798 : case USCRIPT_MOON:
799 0 : sRet = "en";
800 0 : break;
801 : case USCRIPT_MEITEI_MAYEK:
802 0 : sRet = "mni";
803 0 : break;
804 : case USCRIPT_IMPERIAL_ARAMAIC:
805 0 : sRet = "arc";
806 0 : break;
807 : case USCRIPT_AVESTAN:
808 0 : sRet = "ae";
809 0 : break;
810 : case USCRIPT_CHAKMA:
811 0 : sRet = "ccp";
812 0 : break;
813 : case USCRIPT_KOREAN:
814 0 : sRet = "ko";
815 0 : break;
816 : case USCRIPT_KAITHI:
817 0 : sRet = "awa";
818 0 : break;
819 : case USCRIPT_MANICHAEAN:
820 0 : sRet = "xmn";
821 0 : break;
822 : case USCRIPT_INSCRIPTIONAL_PAHLAVI:
823 : case USCRIPT_PSALTER_PAHLAVI:
824 : case USCRIPT_BOOK_PAHLAVI:
825 : case USCRIPT_INSCRIPTIONAL_PARTHIAN:
826 0 : sRet = "xpr";
827 0 : break;
828 : case USCRIPT_SAMARITAN:
829 0 : sRet = "heb";
830 0 : break;
831 : case USCRIPT_TAI_VIET:
832 0 : sRet = "blt";
833 0 : break;
834 : case USCRIPT_MANDAEAN: /* Aliased to USCRIPT_MANDAIC in icu 4.6. */
835 0 : sRet = "mic";
836 0 : break;
837 : #if (U_ICU_VERSION_MAJOR_NUM > 4) || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM >= 4)
838 : case USCRIPT_NABATAEAN: //no language with an assigned code yet
839 0 : sRet = "mis";
840 0 : break;
841 : case USCRIPT_PALMYRENE: //no language with an assigned code yet
842 0 : sRet = "mis";
843 0 : break;
844 : case USCRIPT_BAMUM:
845 0 : sRet = "bax";
846 0 : break;
847 : case USCRIPT_LISU:
848 0 : sRet = "lis";
849 0 : break;
850 : case USCRIPT_NAKHI_GEBA:
851 0 : sRet = "nxq";
852 0 : break;
853 : case USCRIPT_OLD_SOUTH_ARABIAN:
854 0 : sRet = "xsa";
855 0 : break;
856 : case USCRIPT_BASSA_VAH:
857 0 : sRet = "bsq";
858 0 : break;
859 : case USCRIPT_DUPLOYAN_SHORTAND:
860 0 : sRet = "fr";
861 0 : break;
862 : case USCRIPT_ELBASAN:
863 0 : sRet = "sq";
864 0 : break;
865 : case USCRIPT_GRANTHA:
866 0 : sRet = "ta";
867 0 : break;
868 : case USCRIPT_KPELLE:
869 0 : sRet = "kpe";
870 0 : break;
871 : case USCRIPT_LOMA:
872 0 : sRet = "lom";
873 0 : break;
874 : case USCRIPT_MENDE:
875 0 : sRet = "men";
876 0 : break;
877 : case USCRIPT_MEROITIC_CURSIVE:
878 0 : sRet = "xmr";
879 0 : break;
880 : case USCRIPT_OLD_NORTH_ARABIAN:
881 0 : sRet = "xna";
882 0 : break;
883 : case USCRIPT_SINDHI:
884 0 : sRet = "sd";
885 0 : break;
886 : case USCRIPT_WARANG_CITI:
887 0 : sRet = "hoc";
888 0 : break;
889 : #endif
890 : #if (U_ICU_VERSION_MAJOR_NUM > 4) || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM >= 8)
891 : case USCRIPT_AFAKA:
892 0 : sRet = "djk";
893 0 : break;
894 : case USCRIPT_JURCHEN:
895 0 : sRet = "juc";
896 0 : break;
897 : case USCRIPT_MRO:
898 0 : sRet = "cmr";
899 0 : break;
900 : case USCRIPT_NUSHU: //no language with an assigned code yet
901 0 : sRet = "mis";
902 0 : break;
903 : case USCRIPT_SHARADA:
904 0 : sRet = "sa";
905 0 : break;
906 : case USCRIPT_SORA_SOMPENG:
907 0 : sRet = "srb";
908 0 : break;
909 : case USCRIPT_TAKRI:
910 0 : sRet = "doi";
911 0 : break;
912 : case USCRIPT_TANGUT:
913 0 : sRet = "txg";
914 0 : break;
915 : case USCRIPT_WOLEAI:
916 0 : sRet = "woe";
917 0 : break;
918 : #endif
919 : #if (U_ICU_VERSION_MAJOR_NUM >= 49)
920 : case USCRIPT_ANATOLIAN_HIEROGLYPHS:
921 0 : sRet = "hlu";
922 0 : break;
923 : case USCRIPT_KHOJKI:
924 0 : sRet = "gu";
925 0 : break;
926 : case USCRIPT_TIRHUTA:
927 0 : sRet = "mai";
928 0 : break;
929 : #endif
930 : #if (U_ICU_VERSION_MAJOR_NUM >= 52)
931 : case USCRIPT_CAUCASIAN_ALBANIAN:
932 0 : sRet = "xag";
933 0 : break;
934 : case USCRIPT_MAHAJANI:
935 0 : sRet = "mwr";
936 0 : break;
937 : #endif
938 : #if (U_ICU_VERSION_MAJOR_NUM >= 54)
939 : case USCRIPT_AHOM:
940 0 : sRet = "aho";
941 0 : break;
942 : case USCRIPT_HATRAN:
943 0 : sRet = "qly-Hatr";
944 0 : break;
945 : case USCRIPT_MODI:
946 0 : sRet = "mr-Modi";
947 0 : break;
948 : case USCRIPT_MULTANI:
949 0 : sRet = "skr-Mutl";
950 0 : break;
951 : case USCRIPT_PAU_CIN_HAU:
952 0 : sRet = "ctd-Pauc";
953 0 : break;
954 : case USCRIPT_SIDDHAM:
955 0 : sRet = "sa-Sidd";
956 0 : break;
957 : #endif
958 : }
959 14 : return sRet;
960 : }
961 :
962 : //Format a number as a percentage according to the rules of the given
963 : //language, e.g. 100 -> "100%" for en-US vs "100 %" for de-DE
964 1017 : OUString SAL_CALL unicode::formatPercent(double dNumber,
965 : const LanguageTag &rLangTag)
966 : {
967 : // get a currency formatter for this locale ID
968 1017 : UErrorCode errorCode=U_ZERO_ERROR;
969 :
970 1017 : LanguageTag aLangTag(rLangTag);
971 :
972 : // As of CLDR Version 24 these languages were not listed as using spacing
973 : // between number and % but are reported as such by our l10n groups
974 : // http://www.unicode.org/cldr/charts/24/by_type/numbers.number_formatting_patterns.html
975 : // so format using French which has the desired rules
976 1017 : if (aLangTag.getLanguage() == "es" || aLangTag.getLanguage() == "sl")
977 0 : aLangTag = LanguageTag("fr-FR");
978 :
979 2034 : icu::Locale aLocale = LanguageTagIcu::getIcuLocale(aLangTag);
980 :
981 : boost::scoped_ptr<NumberFormat> xF(
982 2034 : NumberFormat::createPercentInstance(aLocale, errorCode));
983 1017 : if(U_FAILURE(errorCode))
984 : {
985 : SAL_WARN("i18n", "NumberFormat::createPercentInstance failed");
986 0 : return OUString::number(dNumber) + "%";
987 : }
988 :
989 2034 : UnicodeString output;
990 1017 : xF->format(dNumber/100, output);
991 1017 : OUString aRet(reinterpret_cast<const sal_Unicode *>(output.getBuffer()),
992 3051 : output.length());
993 1017 : if (rLangTag.getLanguage() == "de")
994 : {
995 : //narrow no-break space instead of (normal) no-break space
996 0 : return aRet.replace(0x00A0, 0x202F);
997 : }
998 2034 : return aRet;
999 : }
1000 :
1001 : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|