Line data Source code
1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /*
3 : * This file is part of the LibreOffice project.
4 : *
5 : * This Source Code Form is subject to the terms of the Mozilla Public
6 : * License, v. 2.0. If a copy of the MPL was not distributed with this
7 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 : *
9 : * This file incorporates work covered by the following license notice:
10 : *
11 : * Licensed to the Apache Software Foundation (ASF) under one or more
12 : * contributor license agreements. See the NOTICE file distributed
13 : * with this work for additional information regarding copyright
14 : * ownership. The ASF licenses this file to you under the Apache
15 : * License, Version 2.0 (the "License"); you may not use this file
16 : * except in compliance with the License. You may obtain a copy of
17 : * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 : */
19 :
20 : #include <boost/scoped_ptr.hpp>
21 : #include <com/sun/star/i18n/UnicodeType.hpp>
22 : #include <com/sun/star/i18n/KCharacterType.hpp>
23 : #include <com/sun/star/i18n/ScriptType.hpp>
24 : #include <i18nlangtag/languagetag.hxx>
25 : #include <i18nlangtag/languagetagicu.hxx>
26 : #include <i18nutil/unicode.hxx>
27 : #include <unicode/numfmt.h>
28 : #include "unicode_data.h"
29 :
30 : // Workaround for glibc braindamage:
31 : // glibc 2.4's langinfo.h does "#define CURRENCY_SYMBOL __CURRENCY_SYMBOL"
32 : // which (obviously) breaks UnicodeType::CURRENCY_SYMBOL
33 : #undef CURRENCY_SYMBOL
34 :
35 : using namespace ::com::sun::star::i18n;
36 :
37 : static const ScriptTypeList defaultTypeList[] = {
38 : { UnicodeScript_kBasicLatin,
39 : UnicodeScript_kBasicLatin,
40 : UnicodeScript_kBasicLatin }, // 0,
41 : { UnicodeScript_kLatin1Supplement,
42 : UnicodeScript_kLatin1Supplement,
43 : UnicodeScript_kLatin1Supplement },// 1,
44 : { UnicodeScript_kLatinExtendedA,
45 : UnicodeScript_kLatinExtendedA,
46 : UnicodeScript_kLatinExtendedA }, // 2,
47 : { UnicodeScript_kLatinExtendedB,
48 : UnicodeScript_kLatinExtendedB,
49 : UnicodeScript_kLatinExtendedB }, // 3,
50 : { UnicodeScript_kIPAExtension,
51 : UnicodeScript_kIPAExtension,
52 : UnicodeScript_kIPAExtension }, // 4,
53 : { UnicodeScript_kSpacingModifier,
54 : UnicodeScript_kSpacingModifier,
55 : UnicodeScript_kSpacingModifier }, // 5,
56 : { UnicodeScript_kCombiningDiacritical,
57 : UnicodeScript_kCombiningDiacritical,
58 : UnicodeScript_kCombiningDiacritical }, // 6,
59 : { UnicodeScript_kGreek,
60 : UnicodeScript_kGreek,
61 : UnicodeScript_kGreek }, // 7,
62 : { UnicodeScript_kCyrillic,
63 : UnicodeScript_kCyrillic,
64 : UnicodeScript_kCyrillic }, // 8,
65 : { UnicodeScript_kArmenian,
66 : UnicodeScript_kArmenian,
67 : UnicodeScript_kArmenian }, // 9,
68 : { UnicodeScript_kHebrew,
69 : UnicodeScript_kHebrew,
70 : UnicodeScript_kHebrew }, // 10,
71 : { UnicodeScript_kArabic,
72 : UnicodeScript_kArabic,
73 : UnicodeScript_kArabic }, // 11,
74 : { UnicodeScript_kSyriac,
75 : UnicodeScript_kSyriac,
76 : UnicodeScript_kSyriac }, // 12,
77 : { UnicodeScript_kThaana,
78 : UnicodeScript_kThaana,
79 : UnicodeScript_kThaana }, // 13,
80 : { UnicodeScript_kDevanagari,
81 : UnicodeScript_kDevanagari,
82 : UnicodeScript_kDevanagari }, // 14,
83 : { UnicodeScript_kBengali,
84 : UnicodeScript_kBengali,
85 : UnicodeScript_kBengali }, // 15,
86 : { UnicodeScript_kGurmukhi,
87 : UnicodeScript_kGurmukhi,
88 : UnicodeScript_kGurmukhi }, // 16,
89 : { UnicodeScript_kGujarati,
90 : UnicodeScript_kGujarati,
91 : UnicodeScript_kGujarati }, // 17,
92 : { UnicodeScript_kOriya,
93 : UnicodeScript_kOriya,
94 : UnicodeScript_kOriya }, // 18,
95 : { UnicodeScript_kTamil,
96 : UnicodeScript_kTamil,
97 : UnicodeScript_kTamil }, // 19,
98 : { UnicodeScript_kTelugu,
99 : UnicodeScript_kTelugu,
100 : UnicodeScript_kTelugu }, // 20,
101 : { UnicodeScript_kKannada,
102 : UnicodeScript_kKannada,
103 : UnicodeScript_kKannada }, // 21,
104 : { UnicodeScript_kMalayalam,
105 : UnicodeScript_kMalayalam,
106 : UnicodeScript_kMalayalam }, // 22,
107 : { UnicodeScript_kSinhala,
108 : UnicodeScript_kSinhala,
109 : UnicodeScript_kSinhala }, // 23,
110 : { UnicodeScript_kThai,
111 : UnicodeScript_kThai,
112 : UnicodeScript_kThai }, // 24,
113 : { UnicodeScript_kLao,
114 : UnicodeScript_kLao,
115 : UnicodeScript_kLao }, // 25,
116 : { UnicodeScript_kTibetan,
117 : UnicodeScript_kTibetan,
118 : UnicodeScript_kTibetan }, // 26,
119 : { UnicodeScript_kMyanmar,
120 : UnicodeScript_kMyanmar,
121 : UnicodeScript_kMyanmar }, // 27,
122 : { UnicodeScript_kGeorgian,
123 : UnicodeScript_kGeorgian,
124 : UnicodeScript_kGeorgian }, // 28,
125 : { UnicodeScript_kHangulJamo,
126 : UnicodeScript_kHangulJamo,
127 : UnicodeScript_kHangulJamo }, // 29,
128 : { UnicodeScript_kEthiopic,
129 : UnicodeScript_kEthiopic,
130 : UnicodeScript_kEthiopic }, // 30,
131 : { UnicodeScript_kCherokee,
132 : UnicodeScript_kCherokee,
133 : UnicodeScript_kCherokee }, // 31,
134 : { UnicodeScript_kUnifiedCanadianAboriginalSyllabics,
135 : UnicodeScript_kUnifiedCanadianAboriginalSyllabics,
136 : UnicodeScript_kUnifiedCanadianAboriginalSyllabics }, // 32,
137 : { UnicodeScript_kOgham,
138 : UnicodeScript_kOgham,
139 : UnicodeScript_kOgham }, // 33,
140 : { UnicodeScript_kRunic,
141 : UnicodeScript_kRunic,
142 : UnicodeScript_kRunic }, // 34,
143 : { UnicodeScript_kKhmer,
144 : UnicodeScript_kKhmer,
145 : UnicodeScript_kKhmer }, // 35,
146 : { UnicodeScript_kMongolian,
147 : UnicodeScript_kMongolian,
148 : UnicodeScript_kMongolian }, // 36,
149 : { UnicodeScript_kLatinExtendedAdditional,
150 : UnicodeScript_kLatinExtendedAdditional,
151 : UnicodeScript_kLatinExtendedAdditional }, // 37,
152 : { UnicodeScript_kGreekExtended,
153 : UnicodeScript_kGreekExtended,
154 : UnicodeScript_kGreekExtended }, // 38,
155 : { UnicodeScript_kGeneralPunctuation,
156 : UnicodeScript_kGeneralPunctuation,
157 : UnicodeScript_kGeneralPunctuation }, // 39,
158 : { UnicodeScript_kSuperSubScript,
159 : UnicodeScript_kSuperSubScript,
160 : UnicodeScript_kSuperSubScript }, // 40,
161 : { UnicodeScript_kCurrencySymbolScript,
162 : UnicodeScript_kCurrencySymbolScript,
163 : UnicodeScript_kCurrencySymbolScript }, // 41,
164 : { UnicodeScript_kSymbolCombiningMark,
165 : UnicodeScript_kSymbolCombiningMark,
166 : UnicodeScript_kSymbolCombiningMark }, // 42,
167 : { UnicodeScript_kLetterlikeSymbol,
168 : UnicodeScript_kLetterlikeSymbol,
169 : UnicodeScript_kLetterlikeSymbol }, // 43,
170 : { UnicodeScript_kNumberForm,
171 : UnicodeScript_kNumberForm,
172 : UnicodeScript_kNumberForm }, // 44,
173 : { UnicodeScript_kArrow,
174 : UnicodeScript_kArrow,
175 : UnicodeScript_kArrow }, // 45,
176 : { UnicodeScript_kMathOperator,
177 : UnicodeScript_kMathOperator,
178 : UnicodeScript_kMathOperator }, // 46,
179 : { UnicodeScript_kMiscTechnical,
180 : UnicodeScript_kMiscTechnical,
181 : UnicodeScript_kMiscTechnical }, // 47,
182 : { UnicodeScript_kControlPicture,
183 : UnicodeScript_kControlPicture,
184 : UnicodeScript_kControlPicture }, // 48,
185 : { UnicodeScript_kOpticalCharacter,
186 : UnicodeScript_kOpticalCharacter,
187 : UnicodeScript_kOpticalCharacter }, // 49,
188 : { UnicodeScript_kEnclosedAlphanumeric,
189 : UnicodeScript_kEnclosedAlphanumeric,
190 : UnicodeScript_kEnclosedAlphanumeric }, // 50,
191 : { UnicodeScript_kBoxDrawing,
192 : UnicodeScript_kBoxDrawing,
193 : UnicodeScript_kBoxDrawing }, // 51,
194 : { UnicodeScript_kBlockElement,
195 : UnicodeScript_kBlockElement,
196 : UnicodeScript_kBlockElement }, // 52,
197 : { UnicodeScript_kGeometricShape,
198 : UnicodeScript_kGeometricShape,
199 : UnicodeScript_kGeometricShape }, // 53,
200 : { UnicodeScript_kMiscSymbol,
201 : UnicodeScript_kMiscSymbol,
202 : UnicodeScript_kMiscSymbol }, // 54,
203 : { UnicodeScript_kDingbat,
204 : UnicodeScript_kDingbat,
205 : UnicodeScript_kDingbat }, // 55,
206 : { UnicodeScript_kBraillePatterns,
207 : UnicodeScript_kBraillePatterns,
208 : UnicodeScript_kBraillePatterns }, // 56,
209 : { UnicodeScript_kCJKRadicalsSupplement,
210 : UnicodeScript_kCJKRadicalsSupplement,
211 : UnicodeScript_kCJKRadicalsSupplement }, // 57,
212 : { UnicodeScript_kKangxiRadicals,
213 : UnicodeScript_kKangxiRadicals,
214 : UnicodeScript_kKangxiRadicals }, // 58,
215 : { UnicodeScript_kIdeographicDescriptionCharacters,
216 : UnicodeScript_kIdeographicDescriptionCharacters,
217 : UnicodeScript_kIdeographicDescriptionCharacters }, // 59,
218 : { UnicodeScript_kCJKSymbolPunctuation,
219 : UnicodeScript_kCJKSymbolPunctuation,
220 : UnicodeScript_kCJKSymbolPunctuation }, // 60,
221 : { UnicodeScript_kHiragana,
222 : UnicodeScript_kHiragana,
223 : UnicodeScript_kHiragana }, // 61,
224 : { UnicodeScript_kKatakana,
225 : UnicodeScript_kKatakana,
226 : UnicodeScript_kKatakana }, // 62,
227 : { UnicodeScript_kBopomofo,
228 : UnicodeScript_kBopomofo,
229 : UnicodeScript_kBopomofo }, // 63,
230 : { UnicodeScript_kHangulCompatibilityJamo,
231 : UnicodeScript_kHangulCompatibilityJamo,
232 : UnicodeScript_kHangulCompatibilityJamo }, // 64,
233 : { UnicodeScript_kKanbun,
234 : UnicodeScript_kKanbun,
235 : UnicodeScript_kKanbun }, // 65,
236 : { UnicodeScript_kBopomofoExtended,
237 : UnicodeScript_kBopomofoExtended,
238 : UnicodeScript_kBopomofoExtended }, // 66,
239 : { UnicodeScript_kEnclosedCJKLetterMonth,
240 : UnicodeScript_kEnclosedCJKLetterMonth,
241 : UnicodeScript_kEnclosedCJKLetterMonth }, // 67,
242 : { UnicodeScript_kCJKCompatibility,
243 : UnicodeScript_kCJKCompatibility,
244 : UnicodeScript_kCJKCompatibility }, // 68,
245 : { UnicodeScript_k_CJKUnifiedIdeographsExtensionA,
246 : UnicodeScript_k_CJKUnifiedIdeographsExtensionA,
247 : UnicodeScript_k_CJKUnifiedIdeographsExtensionA }, // 69,
248 : { UnicodeScript_kCJKUnifiedIdeograph,
249 : UnicodeScript_kCJKUnifiedIdeograph,
250 : UnicodeScript_kCJKUnifiedIdeograph }, // 70,
251 : { UnicodeScript_kYiSyllables,
252 : UnicodeScript_kYiSyllables,
253 : UnicodeScript_kYiSyllables }, // 71,
254 : { UnicodeScript_kYiRadicals,
255 : UnicodeScript_kYiRadicals,
256 : UnicodeScript_kYiRadicals }, // 72,
257 : { UnicodeScript_kHangulSyllable,
258 : UnicodeScript_kHangulSyllable,
259 : UnicodeScript_kHangulSyllable }, // 73,
260 : { UnicodeScript_kHighSurrogate,
261 : UnicodeScript_kHighSurrogate,
262 : UnicodeScript_kHighSurrogate }, // 74,
263 : { UnicodeScript_kHighPrivateUseSurrogate,
264 : UnicodeScript_kHighPrivateUseSurrogate,
265 : UnicodeScript_kHighPrivateUseSurrogate }, // 75,
266 : { UnicodeScript_kLowSurrogate,
267 : UnicodeScript_kLowSurrogate,
268 : UnicodeScript_kLowSurrogate }, // 76,
269 : { UnicodeScript_kPrivateUse,
270 : UnicodeScript_kPrivateUse,
271 : UnicodeScript_kPrivateUse }, // 77,
272 : { UnicodeScript_kCJKCompatibilityIdeograph,
273 : UnicodeScript_kCJKCompatibilityIdeograph,
274 : UnicodeScript_kCJKCompatibilityIdeograph }, // 78,
275 : { UnicodeScript_kAlphabeticPresentation,
276 : UnicodeScript_kAlphabeticPresentation,
277 : UnicodeScript_kAlphabeticPresentation }, // 79,
278 : { UnicodeScript_kArabicPresentationA,
279 : UnicodeScript_kArabicPresentationA,
280 : UnicodeScript_kArabicPresentationA }, // 80,
281 : { UnicodeScript_kCombiningHalfMark,
282 : UnicodeScript_kCombiningHalfMark,
283 : UnicodeScript_kCombiningHalfMark }, // 81,
284 : { UnicodeScript_kCJKCompatibilityForm,
285 : UnicodeScript_kCJKCompatibilityForm,
286 : UnicodeScript_kCJKCompatibilityForm }, // 82,
287 : { UnicodeScript_kSmallFormVariant,
288 : UnicodeScript_kSmallFormVariant,
289 : UnicodeScript_kSmallFormVariant }, // 83,
290 : { UnicodeScript_kArabicPresentationB,
291 : UnicodeScript_kArabicPresentationB,
292 : UnicodeScript_kArabicPresentationB }, // 84,
293 : { UnicodeScript_kNoScript,
294 : UnicodeScript_kNoScript,
295 : UnicodeScript_kNoScript }, // 85,
296 : { UnicodeScript_kHalfwidthFullwidthForm,
297 : UnicodeScript_kHalfwidthFullwidthForm,
298 : UnicodeScript_kHalfwidthFullwidthForm }, // 86,
299 : { UnicodeScript_kScriptCount,
300 : UnicodeScript_kScriptCount,
301 : UnicodeScript_kNoScript } // 87,
302 : };
303 :
304 : sal_Int16 SAL_CALL
305 78 : unicode::getUnicodeScriptType( const sal_Unicode ch, const ScriptTypeList* typeList, sal_Int16 unknownType ) {
306 :
307 78 : if (!typeList) {
308 0 : typeList = defaultTypeList;
309 0 : unknownType = UnicodeScript_kNoScript;
310 : }
311 :
312 78 : sal_Int16 i = 0, type = typeList[0].to;
313 156 : while (type < UnicodeScript_kScriptCount && ch > UnicodeScriptType[type][UnicodeScriptTypeTo]) {
314 0 : type = typeList[++i].to;
315 : }
316 :
317 78 : return (type < UnicodeScript_kScriptCount &&
318 78 : ch >= UnicodeScriptType[typeList[i].from][UnicodeScriptTypeFrom]) ?
319 138 : typeList[i].value : unknownType;
320 : }
321 :
322 : sal_Unicode SAL_CALL
323 20 : unicode::getUnicodeScriptStart( UnicodeScript type) {
324 20 : return UnicodeScriptType[type][UnicodeScriptTypeFrom];
325 : }
326 :
327 : sal_Unicode SAL_CALL
328 20 : unicode::getUnicodeScriptEnd( UnicodeScript type) {
329 20 : return UnicodeScriptType[type][UnicodeScriptTypeTo];
330 : }
331 :
332 : sal_Int16 SAL_CALL
333 237542 : unicode::getUnicodeType( const sal_Unicode ch ) {
334 : static sal_Unicode c = 0x00;
335 : static sal_Int16 r = 0x00;
336 :
337 237542 : if (ch == c) return r;
338 45558 : else c = ch;
339 :
340 45558 : sal_Int16 address = UnicodeTypeIndex[ch >> 8];
341 0 : return r = (sal_Int16)((address < UnicodeTypeNumberBlock) ? UnicodeTypeBlockValue[address] :
342 45558 : UnicodeTypeValue[((address - UnicodeTypeNumberBlock) << 8) + (ch & 0xff)]);
343 : }
344 :
345 : sal_uInt8 SAL_CALL
346 0 : unicode::getUnicodeDirection( const sal_Unicode ch ) {
347 : static sal_Unicode c = 0x00;
348 : static sal_uInt8 r = 0x00;
349 :
350 0 : if (ch == c) return r;
351 0 : else c = ch;
352 :
353 0 : sal_Int16 address = UnicodeDirectionIndex[ch >> 8];
354 0 : return r = ((address < UnicodeDirectionNumberBlock) ? UnicodeDirectionBlockValue[address] :
355 0 : UnicodeDirectionValue[((address - UnicodeDirectionNumberBlock) << 8) + (ch & 0xff)]);
356 :
357 : }
358 :
359 : #define bit(name) (1U << name)
360 :
361 : #define UPPERMASK bit(UnicodeType::UPPERCASE_LETTER)
362 :
363 : #define LOWERMASK bit(UnicodeType::LOWERCASE_LETTER)
364 :
365 : #define TITLEMASK bit(UnicodeType::TITLECASE_LETTER)
366 :
367 : #define ALPHAMASK UPPERMASK|LOWERMASK|TITLEMASK|\
368 : bit(UnicodeType::MODIFIER_LETTER)|\
369 : bit(UnicodeType::OTHER_LETTER)
370 :
371 : #define SPACEMASK bit(UnicodeType::SPACE_SEPARATOR)|\
372 : bit(UnicodeType::LINE_SEPARATOR)|\
373 : bit(UnicodeType::PARAGRAPH_SEPARATOR)
374 :
375 : #define CONTROLMASK bit(UnicodeType::CONTROL)|\
376 : bit(UnicodeType::FORMAT)|\
377 : bit(UnicodeType::LINE_SEPARATOR)|\
378 : bit(UnicodeType::PARAGRAPH_SEPARATOR)
379 :
380 : #define IsType(func, mask) \
381 : bool SAL_CALL func( const sal_Unicode ch) {\
382 : return (bit(getUnicodeType(ch)) & (mask)) != 0;\
383 : }
384 :
385 117983 : IsType(unicode::isControl, CONTROLMASK)
386 119495 : IsType(unicode::isAlpha, ALPHAMASK)
387 40 : IsType(unicode::isSpace, SPACEMASK)
388 :
389 : #define CONTROLSPACE bit(0x09)|bit(0x0a)|bit(0x0b)|bit(0x0c)|bit(0x0d)|\
390 : bit(0x1c)|bit(0x1d)|bit(0x1e)|bit(0x1f)
391 :
392 40 : bool SAL_CALL unicode::isWhiteSpace( const sal_Unicode ch) {
393 40 : return (ch != 0xa0 && isSpace(ch)) || (ch <= 0x1F && (bit(ch) & (CONTROLSPACE)));
394 : }
395 :
396 29256 : sal_Int16 SAL_CALL unicode::getScriptClassFromUScriptCode(UScriptCode eScript)
397 : {
398 : //See unicode/uscript.h
399 : static const sal_Int16 scriptTypes[] =
400 : {
401 : ScriptType::WEAK, ScriptType::WEAK, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX,
402 : ScriptType::ASIAN, ScriptType::LATIN, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX,
403 : ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::LATIN,
404 : // 15
405 : ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::ASIAN, ScriptType::COMPLEX,
406 : ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX,
407 : ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN,
408 : // 30
409 : ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX,
410 : ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
411 : ScriptType::LATIN, ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
412 : // 45
413 : ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX,
414 : ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN,
415 : ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
416 : // 60
417 : ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
418 : ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX,
419 : ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN, ScriptType::ASIAN,
420 : // 75
421 : ScriptType::COMPLEX, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
422 : ScriptType::LATIN, ScriptType::LATIN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
423 : ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
424 : // 90
425 : ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
426 : ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
427 : ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::WEAK, ScriptType::WEAK, ScriptType::COMPLEX,
428 : // 105
429 : ScriptType::ASIAN, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
430 : ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
431 : ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::ASIAN,
432 : // 120
433 : ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
434 : ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::WEAK, ScriptType::WEAK,
435 : ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
436 : // 135
437 : ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
438 : ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX, ScriptType::COMPLEX,
439 : ScriptType::COMPLEX,
440 : ScriptType::WEAK
441 : };
442 :
443 : sal_Int16 nRet;
444 29256 : if (eScript < USCRIPT_COMMON)
445 0 : nRet = ScriptType::WEAK;
446 29256 : else if (static_cast<size_t>(eScript) >= SAL_N_ELEMENTS(scriptTypes))
447 0 : nRet = ScriptType::COMPLEX; // anything new is going to be pretty wild
448 : else
449 29256 : nRet = scriptTypes[eScript];
450 29256 : return nRet;
451 : }
452 :
453 0 : OString SAL_CALL unicode::getExemplarLanguageForUScriptCode(UScriptCode eScript)
454 : {
455 0 : OString sRet;
456 0 : switch (eScript)
457 : {
458 : case USCRIPT_CODE_LIMIT:
459 : case USCRIPT_INVALID_CODE:
460 0 : sRet = "zxx";
461 0 : break;
462 : case USCRIPT_COMMON:
463 : case USCRIPT_INHERITED:
464 0 : sRet = "und";
465 0 : break;
466 : case USCRIPT_MATHEMATICAL_NOTATION:
467 : case USCRIPT_SYMBOLS:
468 0 : sRet = "zxx";
469 0 : break;
470 : case USCRIPT_UNWRITTEN_LANGUAGES:
471 : case USCRIPT_UNKNOWN:
472 0 : sRet = "und";
473 0 : break;
474 : case USCRIPT_ARABIC:
475 0 : sRet = "ar";
476 0 : break;
477 : case USCRIPT_ARMENIAN:
478 0 : sRet = "hy";
479 0 : break;
480 : case USCRIPT_BENGALI:
481 0 : sRet = "bn";
482 0 : break;
483 : case USCRIPT_BOPOMOFO:
484 0 : sRet = "zh";
485 0 : break;
486 : case USCRIPT_CHEROKEE:
487 0 : sRet = "chr";
488 0 : break;
489 : case USCRIPT_COPTIC:
490 0 : sRet = "cop";
491 0 : break;
492 : case USCRIPT_CYRILLIC:
493 0 : sRet = "ru";
494 0 : break;
495 : case USCRIPT_DESERET:
496 0 : sRet = "en";
497 0 : break;
498 : case USCRIPT_DEVANAGARI:
499 0 : sRet = "hi";
500 0 : break;
501 : case USCRIPT_ETHIOPIC:
502 0 : sRet = "am";
503 0 : break;
504 : case USCRIPT_GEORGIAN:
505 0 : sRet = "ka";
506 0 : break;
507 : case USCRIPT_GOTHIC:
508 0 : sRet = "got";
509 0 : break;
510 : case USCRIPT_GREEK:
511 0 : sRet = "el";
512 0 : break;
513 : case USCRIPT_GUJARATI:
514 0 : sRet = "gu";
515 0 : break;
516 : case USCRIPT_GURMUKHI:
517 0 : sRet = "pa";
518 0 : break;
519 : case USCRIPT_HAN:
520 0 : sRet = "zh";
521 0 : break;
522 : case USCRIPT_HANGUL:
523 0 : sRet = "ko";
524 0 : break;
525 : case USCRIPT_HEBREW:
526 0 : sRet = "hr";
527 0 : break;
528 : case USCRIPT_HIRAGANA:
529 0 : sRet = "ja";
530 0 : break;
531 : case USCRIPT_KANNADA:
532 0 : sRet = "kn";
533 0 : break;
534 : case USCRIPT_KATAKANA:
535 0 : sRet = "ja";
536 0 : break;
537 : case USCRIPT_KHMER:
538 0 : sRet = "km";
539 0 : break;
540 : case USCRIPT_LAO:
541 0 : sRet = "lo";
542 0 : break;
543 : case USCRIPT_LATIN:
544 0 : sRet = "en";
545 0 : break;
546 : case USCRIPT_MALAYALAM:
547 0 : sRet = "ml";
548 0 : break;
549 : case USCRIPT_MONGOLIAN:
550 0 : sRet = "mn";
551 0 : break;
552 : case USCRIPT_MYANMAR:
553 0 : sRet = "my";
554 0 : break;
555 : case USCRIPT_OGHAM:
556 0 : sRet = "pgl";
557 0 : break;
558 : case USCRIPT_OLD_ITALIC:
559 0 : sRet = "osc";
560 0 : break;
561 : case USCRIPT_ORIYA:
562 0 : sRet = "or";
563 0 : break;
564 : case USCRIPT_RUNIC:
565 0 : sRet = "ang";
566 0 : break;
567 : case USCRIPT_SINHALA:
568 0 : sRet = "si";
569 0 : break;
570 : case USCRIPT_SYRIAC:
571 0 : sRet = "syr";
572 0 : break;
573 : case USCRIPT_TAMIL:
574 0 : sRet = "ta";
575 0 : break;
576 : case USCRIPT_TELUGU:
577 0 : sRet = "te";
578 0 : break;
579 : case USCRIPT_THAANA:
580 0 : sRet = "dv";
581 0 : break;
582 : case USCRIPT_THAI:
583 0 : sRet = "th";
584 0 : break;
585 : case USCRIPT_TIBETAN:
586 0 : sRet = "bo";
587 0 : break;
588 : case USCRIPT_CANADIAN_ABORIGINAL:
589 0 : sRet = "iu";
590 0 : break;
591 : case USCRIPT_YI:
592 0 : sRet = "ii";
593 0 : break;
594 : case USCRIPT_TAGALOG:
595 0 : sRet = "tl";
596 0 : break;
597 : case USCRIPT_HANUNOO:
598 0 : sRet = "hnn";
599 0 : break;
600 : case USCRIPT_BUHID:
601 0 : sRet = "bku";
602 0 : break;
603 : case USCRIPT_TAGBANWA:
604 0 : sRet = "tbw";
605 0 : break;
606 : case USCRIPT_BRAILLE:
607 0 : sRet = "en";
608 0 : break;
609 : case USCRIPT_CYPRIOT:
610 0 : sRet = "ecy";
611 0 : break;
612 : case USCRIPT_LIMBU:
613 0 : sRet = "lif";
614 0 : break;
615 : case USCRIPT_LINEAR_B:
616 0 : sRet = "gmy";
617 0 : break;
618 : case USCRIPT_OSMANYA:
619 0 : sRet = "so";
620 0 : break;
621 : case USCRIPT_SHAVIAN:
622 0 : sRet = "en";
623 0 : break;
624 : case USCRIPT_TAI_LE:
625 0 : sRet = "tdd";
626 0 : break;
627 : case USCRIPT_UGARITIC:
628 0 : sRet = "uga";
629 0 : break;
630 : case USCRIPT_KATAKANA_OR_HIRAGANA:
631 0 : sRet = "ja";
632 0 : break;
633 : case USCRIPT_BUGINESE:
634 0 : sRet = "bug";
635 0 : break;
636 : case USCRIPT_GLAGOLITIC:
637 0 : sRet = "ch";
638 0 : break;
639 : case USCRIPT_KHAROSHTHI:
640 0 : sRet = "pra";
641 0 : break;
642 : case USCRIPT_SYLOTI_NAGRI:
643 0 : sRet = "syl";
644 0 : break;
645 : case USCRIPT_NEW_TAI_LUE:
646 0 : sRet = "khb";
647 0 : break;
648 : case USCRIPT_TIFINAGH:
649 0 : sRet = "tmh";
650 0 : break;
651 : case USCRIPT_OLD_PERSIAN:
652 0 : sRet = "peo";
653 0 : break;
654 : case USCRIPT_BALINESE:
655 0 : sRet = "ban";
656 0 : break;
657 : case USCRIPT_BATAK:
658 0 : sRet = "btk";
659 0 : break;
660 : case USCRIPT_BLISSYMBOLS:
661 0 : sRet = "en";
662 0 : break;
663 : case USCRIPT_BRAHMI:
664 0 : sRet = "pra";
665 0 : break;
666 : case USCRIPT_CHAM:
667 0 : sRet = "cja";
668 0 : break;
669 : case USCRIPT_CIRTH:
670 0 : sRet = "sjn";
671 0 : break;
672 : case USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC:
673 0 : sRet = "cu";
674 0 : break;
675 : case USCRIPT_DEMOTIC_EGYPTIAN:
676 : case USCRIPT_HIERATIC_EGYPTIAN:
677 : case USCRIPT_EGYPTIAN_HIEROGLYPHS:
678 0 : sRet = "egy";
679 0 : break;
680 : case USCRIPT_KHUTSURI:
681 0 : sRet = "ka";
682 0 : break;
683 : case USCRIPT_SIMPLIFIED_HAN:
684 0 : sRet = "zh";
685 0 : break;
686 : case USCRIPT_TRADITIONAL_HAN:
687 0 : sRet = "zh";
688 0 : break;
689 : case USCRIPT_PAHAWH_HMONG:
690 0 : sRet = "blu";
691 0 : break;
692 : case USCRIPT_OLD_HUNGARIAN:
693 0 : sRet = "ohu";
694 0 : break;
695 : case USCRIPT_HARAPPAN_INDUS:
696 0 : sRet = "xiv";
697 0 : break;
698 : case USCRIPT_JAVANESE:
699 0 : sRet = "kaw";
700 0 : break;
701 : case USCRIPT_KAYAH_LI:
702 0 : sRet = "eky";
703 0 : break;
704 : case USCRIPT_LATIN_FRAKTUR:
705 0 : sRet = "de";
706 0 : break;
707 : case USCRIPT_LATIN_GAELIC:
708 0 : sRet = "ga";
709 0 : break;
710 : case USCRIPT_LEPCHA:
711 0 : sRet = "lep";
712 0 : break;
713 : case USCRIPT_LINEAR_A:
714 0 : sRet = "ecr";
715 0 : break;
716 : case USCRIPT_MAYAN_HIEROGLYPHS:
717 0 : sRet = "myn";
718 0 : break;
719 : case USCRIPT_MEROITIC:
720 0 : sRet = "xmr";
721 0 : break;
722 : case USCRIPT_NKO:
723 0 : sRet = "nqo";
724 0 : break;
725 : case USCRIPT_ORKHON:
726 0 : sRet = "otk";
727 0 : break;
728 : case USCRIPT_OLD_PERMIC:
729 0 : sRet = "kv";
730 0 : break;
731 : case USCRIPT_PHAGS_PA:
732 0 : sRet = "xng";
733 0 : break;
734 : case USCRIPT_PHOENICIAN:
735 0 : sRet = "phn";
736 0 : break;
737 : case USCRIPT_PHONETIC_POLLARD:
738 0 : sRet = "hmd";
739 0 : break;
740 : case USCRIPT_RONGORONGO:
741 0 : sRet = "rap";
742 0 : break;
743 : case USCRIPT_SARATI:
744 0 : sRet = "qya";
745 0 : break;
746 : case USCRIPT_ESTRANGELO_SYRIAC:
747 0 : sRet = "syr";
748 0 : break;
749 : case USCRIPT_WESTERN_SYRIAC:
750 0 : sRet = "tru";
751 0 : break;
752 : case USCRIPT_EASTERN_SYRIAC:
753 0 : sRet = "aii";
754 0 : break;
755 : case USCRIPT_TENGWAR:
756 0 : sRet = "sjn";
757 0 : break;
758 : case USCRIPT_VAI:
759 0 : sRet = "vai";
760 0 : break;
761 : case USCRIPT_VISIBLE_SPEECH:
762 0 : sRet = "en";
763 0 : break;
764 : case USCRIPT_CUNEIFORM:
765 0 : sRet = "akk";
766 0 : break;
767 : case USCRIPT_CARIAN:
768 0 : sRet = "xcr";
769 0 : break;
770 : case USCRIPT_JAPANESE:
771 0 : sRet = "ja";
772 0 : break;
773 : case USCRIPT_LANNA:
774 0 : sRet = "nod";
775 0 : break;
776 : case USCRIPT_LYCIAN:
777 0 : sRet = "xlc";
778 0 : break;
779 : case USCRIPT_LYDIAN:
780 0 : sRet = "xld";
781 0 : break;
782 : case USCRIPT_OL_CHIKI:
783 0 : sRet = "sat";
784 0 : break;
785 : case USCRIPT_REJANG:
786 0 : sRet = "rej";
787 0 : break;
788 : case USCRIPT_SAURASHTRA:
789 0 : sRet = "saz";
790 0 : break;
791 : case USCRIPT_SIGN_WRITING:
792 0 : sRet = "en";
793 0 : break;
794 : case USCRIPT_SUNDANESE:
795 0 : sRet = "su";
796 0 : break;
797 : case USCRIPT_MOON:
798 0 : sRet = "en";
799 0 : break;
800 : case USCRIPT_MEITEI_MAYEK:
801 0 : sRet = "mni";
802 0 : break;
803 : case USCRIPT_IMPERIAL_ARAMAIC:
804 0 : sRet = "arc";
805 0 : break;
806 : case USCRIPT_AVESTAN:
807 0 : sRet = "ae";
808 0 : break;
809 : case USCRIPT_CHAKMA:
810 0 : sRet = "ccp";
811 0 : break;
812 : case USCRIPT_KOREAN:
813 0 : sRet = "ko";
814 0 : break;
815 : case USCRIPT_KAITHI:
816 0 : sRet = "awa";
817 0 : break;
818 : case USCRIPT_MANICHAEAN:
819 0 : sRet = "xmn";
820 0 : break;
821 : case USCRIPT_INSCRIPTIONAL_PAHLAVI:
822 : case USCRIPT_PSALTER_PAHLAVI:
823 : case USCRIPT_BOOK_PAHLAVI:
824 : case USCRIPT_INSCRIPTIONAL_PARTHIAN:
825 0 : sRet = "xpr";
826 0 : break;
827 : case USCRIPT_SAMARITAN:
828 0 : sRet = "heb";
829 0 : break;
830 : case USCRIPT_TAI_VIET:
831 0 : sRet = "blt";
832 0 : break;
833 : case USCRIPT_MANDAEAN: /* Aliased to USCRIPT_MANDAIC in icu 4.6. */
834 0 : sRet = "mic";
835 0 : break;
836 : #if (U_ICU_VERSION_MAJOR_NUM > 4) || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM >= 4)
837 : case USCRIPT_NABATAEAN: //no language with an assigned code yet
838 0 : sRet = "mis";
839 0 : break;
840 : case USCRIPT_PALMYRENE: //no language with an assigned code yet
841 0 : sRet = "mis";
842 0 : break;
843 : case USCRIPT_BAMUM:
844 0 : sRet = "bax";
845 0 : break;
846 : case USCRIPT_LISU:
847 0 : sRet = "lis";
848 0 : break;
849 : case USCRIPT_NAKHI_GEBA:
850 0 : sRet = "nxq";
851 0 : break;
852 : case USCRIPT_OLD_SOUTH_ARABIAN:
853 0 : sRet = "xsa";
854 0 : break;
855 : case USCRIPT_BASSA_VAH:
856 0 : sRet = "bsq";
857 0 : break;
858 : case USCRIPT_DUPLOYAN_SHORTAND:
859 0 : sRet = "fr";
860 0 : break;
861 : case USCRIPT_ELBASAN:
862 0 : sRet = "sq";
863 0 : break;
864 : case USCRIPT_GRANTHA:
865 0 : sRet = "ta";
866 0 : break;
867 : case USCRIPT_KPELLE:
868 0 : sRet = "kpe";
869 0 : break;
870 : case USCRIPT_LOMA:
871 0 : sRet = "lom";
872 0 : break;
873 : case USCRIPT_MENDE:
874 0 : sRet = "men";
875 0 : break;
876 : case USCRIPT_MEROITIC_CURSIVE:
877 0 : sRet = "xmr";
878 0 : break;
879 : case USCRIPT_OLD_NORTH_ARABIAN:
880 0 : sRet = "xna";
881 0 : break;
882 : case USCRIPT_SINDHI:
883 0 : sRet = "sd";
884 0 : break;
885 : case USCRIPT_WARANG_CITI:
886 0 : sRet = "hoc";
887 0 : break;
888 : #endif
889 : #if (U_ICU_VERSION_MAJOR_NUM > 4) || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM >= 8)
890 : case USCRIPT_AFAKA:
891 0 : sRet = "djk";
892 0 : break;
893 : case USCRIPT_JURCHEN:
894 0 : sRet = "juc";
895 0 : break;
896 : case USCRIPT_MRO:
897 0 : sRet = "cmr";
898 0 : break;
899 : case USCRIPT_NUSHU: //no language with an assigned code yet
900 0 : sRet = "mis";
901 0 : break;
902 : case USCRIPT_SHARADA:
903 0 : sRet = "sa";
904 0 : break;
905 : case USCRIPT_SORA_SOMPENG:
906 0 : sRet = "srb";
907 0 : break;
908 : case USCRIPT_TAKRI:
909 0 : sRet = "doi";
910 0 : break;
911 : case USCRIPT_TANGUT:
912 0 : sRet = "txg";
913 0 : break;
914 : case USCRIPT_WOLEAI:
915 0 : sRet = "woe";
916 0 : break;
917 : #endif
918 : #if (U_ICU_VERSION_MAJOR_NUM >= 49)
919 : case USCRIPT_ANATOLIAN_HIEROGLYPHS:
920 0 : sRet = "hlu";
921 0 : break;
922 : case USCRIPT_KHOJKI:
923 0 : sRet = "gu";
924 0 : break;
925 : case USCRIPT_TIRHUTA:
926 0 : sRet = "mai";
927 0 : break;
928 : #endif
929 : #if (U_ICU_VERSION_MAJOR_NUM >= 52)
930 : case USCRIPT_CAUCASIAN_ALBANIAN:
931 0 : sRet = "xag";
932 0 : break;
933 : case USCRIPT_MAHAJANI:
934 0 : sRet = "mwr";
935 0 : break;
936 : #endif
937 : }
938 0 : return sRet;
939 : }
940 :
941 : //Format a number as a percentage according to the rules of the given
942 : //language, e.g. 100 -> "100%" for en-US vs "100 %" for de-DE
943 1684 : OUString SAL_CALL unicode::formatPercent(double dNumber,
944 : const LanguageTag &rLangTag)
945 : {
946 : // get a currency formatter for this locale ID
947 1684 : UErrorCode errorCode=U_ZERO_ERROR;
948 :
949 1684 : LanguageTag aLangTag(rLangTag);
950 :
951 : // As of CLDR Version 24 these languages were not listed as using spacing
952 : // between number and % but are reported as such by our l10n groups
953 : // http://www.unicode.org/cldr/charts/24/by_type/numbers.number_formatting_patterns.html
954 : // so format using French which has the desired rules
955 1684 : if (aLangTag.getLanguage() == "es" || aLangTag.getLanguage() == "sl")
956 0 : aLangTag = LanguageTag("fr-FR");
957 :
958 3368 : icu::Locale aLocale = LanguageTagIcu::getIcuLocale(aLangTag);
959 :
960 : boost::scoped_ptr<NumberFormat> xF(
961 3368 : NumberFormat::createPercentInstance(aLocale, errorCode));
962 1684 : if(U_FAILURE(errorCode))
963 : {
964 : SAL_WARN("i18n", "NumberFormat::createPercentInstance failed");
965 0 : return OUString::number(dNumber) + "%";
966 : }
967 :
968 3368 : UnicodeString output;
969 1684 : xF->format(dNumber/100, output);
970 1684 : OUString aRet(reinterpret_cast<const sal_Unicode *>(output.getBuffer()),
971 5052 : output.length());
972 1684 : if (rLangTag.getLanguage() == "de")
973 : {
974 : //narrow no-break space instead of (normal) no-break space
975 0 : return aRet.replace(0x00A0, 0x202F);
976 : }
977 3368 : return aRet;
978 : }
979 :
980 : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|