Line data Source code
1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /*
3 : * This file is part of the LibreOffice project.
4 : *
5 : * This Source Code Form is subject to the terms of the Mozilla Public
6 : * License, v. 2.0. If a copy of the MPL was not distributed with this
7 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 : *
9 : * This file incorporates work covered by the following license notice:
10 : *
11 : * Licensed to the Apache Software Foundation (ASF) under one or more
12 : * contributor license agreements. See the NOTICE file distributed
13 : * with this work for additional information regarding copyright
14 : * ownership. The ASF licenses this file to you under the Apache
15 : * License, Version 2.0 (the "License"); you may not use this file
16 : * except in compliance with the License. You may obtain a copy of
17 : * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 : */
19 :
20 : #include <breakiteratorImpl.hxx>
21 : #include <cppuhelper/supportsservice.hxx>
22 : #include <unicode/uchar.h>
23 : #include <i18nutil/unicode.hxx>
24 : #include <rtl/ustrbuf.hxx>
25 :
26 : using namespace ::com::sun::star::uno;
27 : using namespace ::com::sun::star::lang;
28 :
29 : namespace com { namespace sun { namespace star { namespace i18n {
30 :
31 26608 : BreakIteratorImpl::BreakIteratorImpl( const Reference < XComponentContext >& rxContext ) : m_xContext( rxContext )
32 : {
33 26608 : }
34 :
35 5086 : BreakIteratorImpl::BreakIteratorImpl()
36 : {
37 5086 : }
38 :
39 85424 : BreakIteratorImpl::~BreakIteratorImpl()
40 : {
41 : // Clear lookuptable
42 40392 : for (size_t l = 0; l < lookupTable.size(); l++)
43 10230 : delete lookupTable[l];
44 30162 : lookupTable.clear();
45 55262 : }
46 :
47 : #define LBI getLocaleSpecificBreakIterator(rLocale)
48 :
49 8604646 : sal_Int32 SAL_CALL BreakIteratorImpl::nextCharacters( const OUString& Text, sal_Int32 nStartPos,
50 : const Locale &rLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
51 : throw(RuntimeException, std::exception)
52 : {
53 8604646 : if (nCount < 0) throw RuntimeException();
54 :
55 8604646 : return LBI->nextCharacters( Text, nStartPos, rLocale, nCharacterIteratorMode, nCount, nDone);
56 : }
57 :
58 98900 : sal_Int32 SAL_CALL BreakIteratorImpl::previousCharacters( const OUString& Text, sal_Int32 nStartPos,
59 : const Locale& rLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
60 : throw(RuntimeException, std::exception)
61 : {
62 98900 : if (nCount < 0) throw RuntimeException();
63 :
64 98900 : return LBI->previousCharacters( Text, nStartPos, rLocale, nCharacterIteratorMode, nCount, nDone);
65 : }
66 :
67 : #define isZWSP(c) (ch == 0x200B)
68 :
69 268525540 : static sal_Int32 skipSpace(const OUString& Text, sal_Int32 nPos, sal_Int32 len, sal_Int16 rWordType, bool bDirection)
70 : {
71 268525540 : sal_uInt32 ch=0;
72 268525540 : sal_Int32 pos=nPos;
73 268525540 : switch (rWordType) {
74 : case WordType::ANYWORD_IGNOREWHITESPACES:
75 1055 : if (bDirection)
76 530 : while (nPos < len && (u_isWhitespace(ch = Text.iterateCodePoints(&pos, 1)) || isZWSP(ch))) nPos=pos;
77 : else
78 525 : while (nPos > 0 && (u_isWhitespace(ch = Text.iterateCodePoints(&pos, -1)) || isZWSP(ch))) nPos=pos;
79 1055 : break;
80 : case WordType::DICTIONARY_WORD:
81 267963207 : if (bDirection)
82 342160102 : while (nPos < len && (u_isWhitespace(ch = Text.iterateCodePoints(&pos, 1)) || isZWSP(ch) ||
83 177038910 : ! (ch == 0x002E || u_isalnum(ch)))) nPos=pos;
84 : else
85 381558694 : while (nPos > 0 && (u_isWhitespace(ch = Text.iterateCodePoints(&pos, -1)) || isZWSP(ch) ||
86 156161502 : ! (ch == 0x002E || u_isalnum(ch)))) nPos=pos;
87 267963207 : break;
88 : case WordType::WORD_COUNT:
89 516836 : if (bDirection)
90 258418 : while (nPos < len && (u_isUWhiteSpace(ch = Text.iterateCodePoints(&pos, 1)) || isZWSP(ch))) nPos=pos;
91 : else
92 258418 : while (nPos > 0 && (u_isUWhiteSpace(ch = Text.iterateCodePoints(&pos, -1)) || isZWSP(ch))) nPos=pos;
93 516836 : break;
94 : }
95 268525540 : return nPos;
96 : }
97 :
98 2862 : Boundary SAL_CALL BreakIteratorImpl::nextWord( const OUString& Text, sal_Int32 nStartPos,
99 : const Locale& rLocale, sal_Int16 rWordType ) throw(RuntimeException, std::exception)
100 : {
101 2862 : sal_Int32 len = Text.getLength();
102 2862 : if( nStartPos < 0 || len == 0 )
103 61 : result.endPos = result.startPos = 0;
104 2801 : else if (nStartPos >= len)
105 2 : result.endPos = result.startPos = len;
106 : else {
107 2799 : result = LBI->nextWord(Text, nStartPos, rLocale, rWordType);
108 :
109 2799 : nStartPos = skipSpace(Text, result.startPos, len, rWordType, true);
110 :
111 2799 : if ( nStartPos != result.startPos) {
112 54 : if( nStartPos >= len )
113 2 : result.startPos = result.endPos = len;
114 : else {
115 52 : result = LBI->getWordBoundary(Text, nStartPos, rLocale, rWordType, sal_True);
116 : // i88041: avoid startPos goes back to nStartPos when switching between Latin and CJK scripts
117 52 : if (result.startPos < nStartPos) result.startPos = nStartPos;
118 : }
119 : }
120 : }
121 2862 : return result;
122 : }
123 :
124 2 : static inline bool SAL_CALL isCJK( const Locale& rLocale ) {
125 2 : return rLocale.Language == "zh" || rLocale.Language == "ja" || rLocale.Language == "ko";
126 : }
127 :
128 429 : Boundary SAL_CALL BreakIteratorImpl::previousWord( const OUString& Text, sal_Int32 nStartPos,
129 : const Locale& rLocale, sal_Int16 rWordType) throw(RuntimeException, std::exception)
130 : {
131 429 : sal_Int32 len = Text.getLength();
132 429 : if( nStartPos <= 0 || len == 0 ) {
133 0 : result.endPos = result.startPos = 0;
134 0 : return result;
135 429 : } else if (nStartPos > len) {
136 0 : result.endPos = result.startPos = len;
137 0 : return result;
138 : }
139 :
140 429 : sal_Int32 nPos = skipSpace(Text, nStartPos, len, rWordType, false);
141 :
142 : // if some spaces are skipped, and the script type is Asian with no CJK rLocale, we have to return
143 : // (nStartPos, -1) for caller to send correct rLocale for loading correct dictionary.
144 429 : result.startPos = nPos;
145 429 : if (nPos != nStartPos && nPos > 0 && !isCJK(rLocale) && getScriptClass(Text.iterateCodePoints(&nPos, -1)) == ScriptType::ASIAN) {
146 0 : result.endPos = -1;
147 0 : return result;
148 : }
149 :
150 429 : return LBI->previousWord(Text, result.startPos, rLocale, rWordType);
151 : }
152 :
153 :
154 134305294 : Boundary SAL_CALL BreakIteratorImpl::getWordBoundary( const OUString& Text, sal_Int32 nPos, const Locale& rLocale,
155 : sal_Int16 rWordType, sal_Bool bDirection ) throw(RuntimeException, std::exception)
156 : {
157 134305294 : sal_Int32 len = Text.getLength();
158 134305294 : if( nPos < 0 || len == 0 )
159 44236 : result.endPos = result.startPos = 0;
160 134261058 : else if (nPos > len)
161 0 : result.endPos = result.startPos = len;
162 : else {
163 : sal_Int32 next, prev;
164 134261058 : next = skipSpace(Text, nPos, len, rWordType, true);
165 134261058 : prev = skipSpace(Text, nPos, len, rWordType, false);
166 134261058 : if (prev == 0 && next == len) {
167 387 : result.endPos = result.startPos = nPos;
168 134260671 : } else if (prev == 0 && ! bDirection) {
169 0 : result.endPos = result.startPos = 0;
170 134260671 : } else if (next == len && bDirection) {
171 20053 : result.endPos = result.startPos = len;
172 : } else {
173 134240618 : if (next != prev) {
174 93927745 : if (next == nPos && next != len)
175 56832543 : bDirection = sal_True;
176 37095202 : else if (prev == nPos && prev != 0)
177 36707572 : bDirection = sal_False;
178 : else
179 387630 : nPos = bDirection ? next : prev;
180 : }
181 134240618 : result = LBI->getWordBoundary(Text, nPos, rLocale, rWordType, bDirection);
182 : }
183 : }
184 134305294 : return result;
185 : }
186 :
187 98 : sal_Bool SAL_CALL BreakIteratorImpl::isBeginWord( const OUString& Text, sal_Int32 nPos,
188 : const Locale& rLocale, sal_Int16 rWordType ) throw(RuntimeException, std::exception)
189 : {
190 98 : sal_Int32 len = Text.getLength();
191 :
192 98 : if (nPos < 0 || nPos >= len) return sal_False;
193 :
194 98 : sal_Int32 tmp = skipSpace(Text, nPos, len, rWordType, true);
195 :
196 98 : if (tmp != nPos) return sal_False;
197 :
198 93 : result = getWordBoundary(Text, nPos, rLocale, rWordType, sal_True);
199 :
200 93 : return result.startPos == nPos;
201 : }
202 :
203 100 : sal_Bool SAL_CALL BreakIteratorImpl::isEndWord( const OUString& Text, sal_Int32 nPos,
204 : const Locale& rLocale, sal_Int16 rWordType ) throw(RuntimeException, std::exception)
205 : {
206 100 : sal_Int32 len = Text.getLength();
207 :
208 100 : if (nPos <= 0 || nPos > len) return sal_False;
209 :
210 98 : sal_Int32 tmp = skipSpace(Text, nPos, len, rWordType, false);
211 :
212 98 : if (tmp != nPos) return sal_False;
213 :
214 94 : result = getWordBoundary(Text, nPos, rLocale, rWordType, sal_False);
215 :
216 94 : return result.endPos == nPos;
217 : }
218 :
219 8 : sal_Int32 SAL_CALL BreakIteratorImpl::beginOfSentence( const OUString& Text, sal_Int32 nStartPos,
220 : const Locale &rLocale ) throw(RuntimeException, std::exception)
221 : {
222 8 : if (nStartPos < 0 || nStartPos > Text.getLength())
223 0 : return -1;
224 8 : if (Text.isEmpty()) return 0;
225 8 : return LBI->beginOfSentence(Text, nStartPos, rLocale);
226 : }
227 :
228 689 : sal_Int32 SAL_CALL BreakIteratorImpl::endOfSentence( const OUString& Text, sal_Int32 nStartPos,
229 : const Locale &rLocale ) throw(RuntimeException, std::exception)
230 : {
231 689 : if (nStartPos < 0 || nStartPos > Text.getLength())
232 0 : return -1;
233 689 : if (Text.isEmpty()) return 0;
234 689 : return LBI->endOfSentence(Text, nStartPos, rLocale);
235 : }
236 :
237 42527 : LineBreakResults SAL_CALL BreakIteratorImpl::getLineBreak( const OUString& Text, sal_Int32 nStartPos,
238 : const Locale& rLocale, sal_Int32 nMinBreakPos, const LineBreakHyphenationOptions& hOptions,
239 : const LineBreakUserOptions& bOptions ) throw(RuntimeException, std::exception)
240 : {
241 42527 : return LBI->getLineBreak(Text, nStartPos, rLocale, nMinBreakPos, hOptions, bOptions);
242 : }
243 :
244 316312301 : sal_Int16 SAL_CALL BreakIteratorImpl::getScriptType( const OUString& Text, sal_Int32 nPos )
245 : throw(RuntimeException, std::exception)
246 : {
247 632624528 : return (nPos < 0 || nPos >= Text.getLength()) ? ScriptType::WEAK :
248 632590444 : getScriptClass(Text.iterateCodePoints(&nPos, 0));
249 : }
250 :
251 :
252 : /** Increments/decrements position first, then obtains character.
253 : @return current position, may be -1 or text length if string was consumed.
254 : */
255 817652214 : static sal_Int32 SAL_CALL iterateCodePoints(const OUString& Text, sal_Int32 &nStartPos, sal_Int32 inc, sal_uInt32& ch) {
256 817652214 : sal_Int32 nLen = Text.getLength();
257 817652214 : if (nStartPos + inc < 0 || nStartPos + inc >= nLen) {
258 85729559 : ch = 0;
259 85729559 : nStartPos = nStartPos + inc < 0 ? -1 : nLen;
260 : } else {
261 731922655 : ch = Text.iterateCodePoints(&nStartPos, inc);
262 : // Fix for #i80436#.
263 : // erAck: 2009-06-30T21:52+0200 This logic looks somewhat
264 : // suspicious as if it cures a symptom.. anyway, had to add
265 : // nStartPos < Text.getLength() to silence the (correct) assertion
266 : // in rtl_uString_iterateCodePoints() if Text was one character
267 : // (codepoint) only, made up of a surrogate pair.
268 : //if (inc > 0 && nStartPos < Text.getLength())
269 : // ch = Text.iterateCodePoints(&nStartPos, 0);
270 : // With surrogates, nStartPos may actually point behind string
271 : // now, even if inc is only +1
272 731922655 : if (inc > 0)
273 695096438 : ch = (nStartPos < nLen ? Text.iterateCodePoints(&nStartPos, 0) : 0);
274 : }
275 817652214 : return nStartPos;
276 : }
277 :
278 :
279 36824427 : sal_Int32 SAL_CALL BreakIteratorImpl::beginOfScript( const OUString& Text,
280 : sal_Int32 nStartPos, sal_Int16 ScriptType ) throw(RuntimeException, std::exception)
281 : {
282 36824427 : if (nStartPos < 0 || nStartPos >= Text.getLength())
283 2 : return -1;
284 :
285 36824425 : if(ScriptType != getScriptClass(Text.iterateCodePoints(&nStartPos, 0)))
286 0 : return -1;
287 :
288 36824425 : if (nStartPos == 0) return 0;
289 36824425 : sal_uInt32 ch=0;
290 73650642 : while (iterateCodePoints(Text, nStartPos, -1, ch) >= 0 && ScriptType == getScriptClass(ch)) {
291 2618 : if (nStartPos == 0) return 0;
292 : }
293 :
294 36823599 : return iterateCodePoints(Text, nStartPos, 1, ch);
295 : }
296 :
297 86031563 : sal_Int32 SAL_CALL BreakIteratorImpl::endOfScript( const OUString& Text,
298 : sal_Int32 nStartPos, sal_Int16 ScriptType ) throw(RuntimeException, std::exception)
299 : {
300 86031563 : if (nStartPos < 0 || nStartPos >= Text.getLength())
301 11552 : return -1;
302 :
303 86020011 : if(ScriptType != getScriptClass(Text.iterateCodePoints(&nStartPos, 0)))
304 0 : return -1;
305 :
306 86020011 : sal_Int32 strLen = Text.getLength();
307 86020011 : sal_uInt32 ch=0;
308 830004651 : while(iterateCodePoints(Text, nStartPos, 1, ch) < strLen ) {
309 658256145 : sal_Int16 currentCharScriptType = getScriptClass(ch);
310 658256145 : if(ScriptType != currentCharScriptType && currentCharScriptType != ScriptType::WEAK)
311 291516 : break;
312 : }
313 86020011 : return nStartPos;
314 : }
315 :
316 0 : sal_Int32 SAL_CALL BreakIteratorImpl::previousScript( const OUString& Text,
317 : sal_Int32 nStartPos, sal_Int16 ScriptType ) throw(RuntimeException, std::exception)
318 : {
319 0 : if (nStartPos < 0)
320 0 : return -1;
321 0 : if (nStartPos > Text.getLength())
322 0 : nStartPos = Text.getLength();
323 :
324 0 : sal_Int16 numberOfChange = (ScriptType == getScriptClass(Text.iterateCodePoints(&nStartPos, 0))) ? 3 : 2;
325 :
326 0 : sal_uInt32 ch=0;
327 0 : while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, -1, ch) >= 0) {
328 0 : if ((((numberOfChange % 2) == 0) != (ScriptType != getScriptClass(ch))))
329 0 : numberOfChange--;
330 0 : else if (nStartPos == 0) {
331 0 : return -1;
332 : }
333 : }
334 0 : return numberOfChange == 0 ? iterateCodePoints(Text, nStartPos, 1, ch) : -1;
335 : }
336 :
337 0 : sal_Int32 SAL_CALL BreakIteratorImpl::nextScript( const OUString& Text, sal_Int32 nStartPos,
338 : sal_Int16 ScriptType ) throw(RuntimeException, std::exception)
339 :
340 : {
341 0 : if (nStartPos < 0)
342 0 : nStartPos = 0;
343 0 : sal_Int32 strLen = Text.getLength();
344 0 : if (nStartPos >= strLen)
345 0 : return -1;
346 :
347 0 : sal_Int16 numberOfChange = (ScriptType == getScriptClass(Text.iterateCodePoints(&nStartPos, 0))) ? 2 : 1;
348 :
349 0 : sal_uInt32 ch=0;
350 0 : while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, 1, ch) < strLen) {
351 0 : sal_Int16 currentCharScriptType = getScriptClass(ch);
352 0 : if ((numberOfChange == 1) ? (ScriptType == currentCharScriptType) :
353 0 : (ScriptType != currentCharScriptType && currentCharScriptType != ScriptType::WEAK))
354 0 : numberOfChange--;
355 : }
356 0 : return numberOfChange == 0 ? nStartPos : -1;
357 : }
358 :
359 0 : sal_Int32 SAL_CALL BreakIteratorImpl::beginOfCharBlock( const OUString& Text, sal_Int32 nStartPos,
360 : const Locale& /*rLocale*/, sal_Int16 CharType ) throw(RuntimeException, std::exception)
361 : {
362 0 : if (CharType == CharType::ANY_CHAR) return 0;
363 0 : if (nStartPos < 0 || nStartPos >= Text.getLength()) return -1;
364 0 : if (CharType != (sal_Int16)u_charType( Text.iterateCodePoints(&nStartPos, 0))) return -1;
365 :
366 0 : sal_Int32 nPos=nStartPos;
367 0 : while(nStartPos > 0 && CharType == (sal_Int16)u_charType(Text.iterateCodePoints(&nPos, -1))) { nStartPos=nPos; }
368 0 : return nStartPos; // begin of char block is inclusive
369 : }
370 :
371 1640 : sal_Int32 SAL_CALL BreakIteratorImpl::endOfCharBlock( const OUString& Text, sal_Int32 nStartPos,
372 : const Locale& /*rLocale*/, sal_Int16 CharType ) throw(RuntimeException, std::exception)
373 : {
374 1640 : sal_Int32 strLen = Text.getLength();
375 :
376 1640 : if (CharType == CharType::ANY_CHAR) return strLen; // end of char block is exclusive
377 1640 : if (nStartPos < 0 || nStartPos >= strLen) return -1;
378 1557 : if (CharType != (sal_Int16)u_charType(Text.iterateCodePoints(&nStartPos, 0))) return -1;
379 :
380 1187 : sal_uInt32 ch=0;
381 1187 : while(iterateCodePoints(Text, nStartPos, 1, ch) < strLen && CharType == (sal_Int16)u_charType(ch)) {}
382 1187 : return nStartPos; // end of char block is exclusive
383 : }
384 :
385 1885 : sal_Int32 SAL_CALL BreakIteratorImpl::nextCharBlock( const OUString& Text, sal_Int32 nStartPos,
386 : const Locale& /*rLocale*/, sal_Int16 CharType ) throw(RuntimeException, std::exception)
387 : {
388 1885 : if (CharType == CharType::ANY_CHAR) return -1;
389 1885 : if (nStartPos < 0 || nStartPos >= Text.getLength()) return -1;
390 :
391 1854 : sal_Int16 numberOfChange = (CharType == (sal_Int16)u_charType(Text.iterateCodePoints(&nStartPos, 0))) ? 2 : 1;
392 1854 : sal_Int32 strLen = Text.getLength();
393 :
394 1854 : sal_uInt32 ch=0;
395 15409 : while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, 1, ch) < strLen) {
396 11701 : if ((CharType != (sal_Int16)u_charType(ch)) != (numberOfChange == 1))
397 833 : numberOfChange--;
398 : }
399 1854 : return numberOfChange == 0 ? nStartPos : -1;
400 : }
401 :
402 0 : sal_Int32 SAL_CALL BreakIteratorImpl::previousCharBlock( const OUString& Text, sal_Int32 nStartPos,
403 : const Locale& /*rLocale*/, sal_Int16 CharType ) throw(RuntimeException, std::exception)
404 : {
405 0 : if(CharType == CharType::ANY_CHAR) return -1;
406 0 : if (nStartPos < 0 || nStartPos >= Text.getLength()) return -1;
407 :
408 0 : sal_Int16 numberOfChange = (CharType == (sal_Int16)u_charType(Text.iterateCodePoints(&nStartPos, 0))) ? 3 : 2;
409 :
410 0 : sal_uInt32 ch=0;
411 0 : while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, -1, ch) >= 0) {
412 0 : if (((numberOfChange % 2) == 0) != (CharType != (sal_Int16)u_charType(ch)))
413 0 : numberOfChange--;
414 0 : if (nStartPos == 0 && numberOfChange > 0) {
415 0 : numberOfChange--;
416 0 : if (numberOfChange == 0) return nStartPos;
417 : }
418 : }
419 0 : return numberOfChange == 0 ? iterateCodePoints(Text, nStartPos, 1, ch) : -1;
420 : }
421 :
422 :
423 :
424 12606 : sal_Int16 SAL_CALL BreakIteratorImpl::getWordType( const OUString& /*Text*/,
425 : sal_Int32 /*nPos*/, const Locale& /*rLocale*/ ) throw(RuntimeException, std::exception)
426 : {
427 12606 : return 0;
428 : }
429 :
430 : namespace
431 : {
432 284417 : sal_Int16 getScriptClassByUAX24Script(sal_uInt32 currentChar)
433 : {
434 284417 : int32_t script = u_getIntPropertyValue(currentChar, UCHAR_SCRIPT);
435 284417 : return unicode::getScriptClassFromUScriptCode(static_cast<UScriptCode>(script));
436 : }
437 :
438 : struct UBlock2Script
439 : {
440 : UBlockCode from;
441 : UBlockCode to;
442 : sal_Int16 script;
443 : };
444 :
445 : static const UBlock2Script scriptList[] =
446 : {
447 : {UBLOCK_NO_BLOCK, UBLOCK_NO_BLOCK, ScriptType::WEAK},
448 : {UBLOCK_BASIC_LATIN, UBLOCK_ARMENIAN, ScriptType::LATIN},
449 : {UBLOCK_HEBREW, UBLOCK_MYANMAR, ScriptType::COMPLEX},
450 : {UBLOCK_GEORGIAN, UBLOCK_GEORGIAN, ScriptType::LATIN},
451 : {UBLOCK_HANGUL_JAMO, UBLOCK_HANGUL_JAMO, ScriptType::ASIAN},
452 : {UBLOCK_ETHIOPIC, UBLOCK_ETHIOPIC, ScriptType::COMPLEX},
453 : {UBLOCK_CHEROKEE, UBLOCK_RUNIC, ScriptType::LATIN},
454 : {UBLOCK_KHMER, UBLOCK_MONGOLIAN, ScriptType::COMPLEX},
455 : {UBLOCK_LATIN_EXTENDED_ADDITIONAL, UBLOCK_GREEK_EXTENDED, ScriptType::LATIN},
456 : {UBLOCK_NUMBER_FORMS, UBLOCK_NUMBER_FORMS, ScriptType::WEAK},
457 : {UBLOCK_CJK_RADICALS_SUPPLEMENT, UBLOCK_HANGUL_SYLLABLES, ScriptType::ASIAN},
458 : {UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, ScriptType::ASIAN},
459 : {UBLOCK_ARABIC_PRESENTATION_FORMS_A, UBLOCK_ARABIC_PRESENTATION_FORMS_A, ScriptType::COMPLEX},
460 : {UBLOCK_CJK_COMPATIBILITY_FORMS, UBLOCK_CJK_COMPATIBILITY_FORMS, ScriptType::ASIAN},
461 : {UBLOCK_ARABIC_PRESENTATION_FORMS_B, UBLOCK_ARABIC_PRESENTATION_FORMS_B, ScriptType::COMPLEX},
462 : {UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, ScriptType::ASIAN},
463 : {UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT, ScriptType::ASIAN},
464 : {UBLOCK_CJK_STROKES, UBLOCK_CJK_STROKES, ScriptType::ASIAN},
465 : {UBLOCK_LATIN_EXTENDED_C, UBLOCK_LATIN_EXTENDED_D, ScriptType::LATIN}
466 : };
467 :
468 : #define scriptListCount SAL_N_ELEMENTS(scriptList)
469 :
470 : //always sets rScriptType
471 :
472 : //returns true for characters historically explicitly assigned to
473 : //latin/weak/asian
474 :
475 : //returns false for characters that historically implicitly assigned to
476 : //weak as unknown
477 786290303 : bool getCompatibilityScriptClassByBlock(sal_uInt32 currentChar, sal_Int16 &rScriptType)
478 : {
479 786290303 : bool bKnown = true;
480 : //handle specific characters always as weak:
481 : // 0x01 - this breaks a word
482 : // 0x02 - this can be inside a word
483 : // 0x20 & 0xA0 - Bug 102975, declare western space and non-break space as WEAK char.
484 786290303 : if( 0x01 == currentChar || 0x02 == currentChar || 0x20 == currentChar || 0xA0 == currentChar)
485 43325775 : rScriptType = ScriptType::WEAK;
486 : // workaround for Coptic
487 742964528 : else if ( 0x2C80 <= currentChar && 0x2CE3 >= currentChar)
488 0 : rScriptType = ScriptType::LATIN;
489 : else
490 : {
491 742964528 : UBlockCode block=ublock_getCode(currentChar);
492 742964528 : size_t i = 0;
493 2232865413 : while (i < scriptListCount)
494 : {
495 1489900885 : if (block <= scriptList[i].to)
496 742964528 : break;
497 746936357 : ++i;
498 : }
499 742964528 : if (i < scriptListCount && block >= scriptList[i].from)
500 742680111 : rScriptType = scriptList[i].script;
501 : else
502 : {
503 284417 : rScriptType = ScriptType::WEAK;
504 284417 : bKnown = false;
505 : }
506 : }
507 786290303 : return bKnown;
508 : }
509 : }
510 :
511 1134204943 : sal_Int16 BreakIteratorImpl::getScriptClass(sal_uInt32 currentChar)
512 : {
513 : static sal_uInt32 lastChar = 0;
514 : static sal_Int16 nRet = 0;
515 :
516 1134204943 : if (currentChar != lastChar)
517 : {
518 786290303 : lastChar = currentChar;
519 :
520 786290303 : if (!getCompatibilityScriptClassByBlock(currentChar, nRet))
521 284417 : nRet = getScriptClassByUAX24Script(currentChar);
522 : }
523 :
524 1134204943 : return nRet;
525 : }
526 :
527 144283806 : static inline bool operator == (const Locale& l1, const Locale& l2) {
528 144283806 : return l1.Language == l2.Language && l1.Country == l2.Country && l1.Variant == l2.Variant;
529 : }
530 :
531 14383 : bool SAL_CALL BreakIteratorImpl::createLocaleSpecificBreakIterator(const OUString& aLocaleName) throw( RuntimeException )
532 : {
533 : // to share service between same Language but different Country code, like zh_CN and zh_TW
534 15047 : for (size_t l = 0; l < lookupTable.size(); l++) {
535 788 : lookupTableItem *listItem = lookupTable[l];
536 788 : if (aLocaleName == listItem->aLocale.Language) {
537 124 : xBI = listItem->xBI;
538 124 : return true;
539 : }
540 : }
541 :
542 28518 : Reference < uno::XInterface > xI = m_xContext->getServiceManager()->createInstanceWithContext(
543 14259 : "com.sun.star.i18n.BreakIterator_" + aLocaleName, m_xContext);
544 :
545 14259 : if ( xI.is() ) {
546 5086 : xBI.set(xI, UNO_QUERY);
547 5086 : if (xBI.is()) {
548 5086 : lookupTable.push_back(new lookupTableItem(Locale(aLocaleName, aLocaleName, aLocaleName), xBI));
549 5086 : return true;
550 : }
551 : }
552 9173 : return false;
553 : }
554 :
555 : Reference < XBreakIterator > SAL_CALL
556 142990668 : BreakIteratorImpl::getLocaleSpecificBreakIterator(const Locale& rLocale) throw (RuntimeException)
557 : {
558 142990668 : if (xBI.is() && rLocale == aLocale)
559 142468461 : return xBI;
560 522207 : else if (m_xContext.is()) {
561 522207 : aLocale = rLocale;
562 :
563 1303430 : for (size_t i = 0; i < lookupTable.size(); i++) {
564 1298220 : lookupTableItem *listItem = lookupTable[i];
565 1298220 : if (rLocale == listItem->aLocale)
566 1039204 : return xBI = listItem->xBI;
567 : }
568 :
569 5210 : sal_Unicode under = (sal_Unicode)'_';
570 :
571 5210 : sal_Int32 l = rLocale.Language.getLength();
572 5210 : sal_Int32 c = rLocale.Country.getLength();
573 5210 : sal_Int32 v = rLocale.Variant.getLength();
574 5210 : OUStringBuffer aBuf(l+c+v+3);
575 :
576 20813 : if ((l > 0 && c > 0 && v > 0 &&
577 : // load service with name <base>_<lang>_<country>_<varian>
578 0 : createLocaleSpecificBreakIterator(aBuf.append(rLocale.Language).append(under).append(
579 10420 : rLocale.Country).append(under).append(rLocale.Variant).makeStringAndClear())) ||
580 15517 : (l > 0 && c > 0 &&
581 : // load service with name <base>_<lang>_<country>
582 5167 : createLocaleSpecificBreakIterator(aBuf.append(rLocale.Language).append(under).append(
583 25380 : rLocale.Country).makeStringAndClear())) ||
584 4702 : (l > 0 && c > 0 && rLocale.Language == "zh" &&
585 120 : (rLocale.Country == "HK" ||
586 60 : rLocale.Country == "MO" ) &&
587 : // if the country code is HK or MO, one more step to try TW.
588 0 : createLocaleSpecificBreakIterator(aBuf.append(rLocale.Language).append(under).appendAscii(
589 9879 : "TW").makeStringAndClear())) ||
590 4642 : (l > 0 &&
591 : // load service with name <base>_<lang>
592 19000 : createLocaleSpecificBreakIterator(rLocale.Language)) ||
593 : // load default service with name <base>_Unicode
594 14358 : createLocaleSpecificBreakIterator(OUString("Unicode"))) {
595 5210 : lookupTable.push_back( new lookupTableItem(aLocale, xBI) );
596 5210 : return xBI;
597 0 : }
598 : }
599 0 : throw RuntimeException();
600 : }
601 :
602 : OUString SAL_CALL
603 1 : BreakIteratorImpl::getImplementationName() throw( RuntimeException, std::exception )
604 : {
605 1 : return OUString("com.sun.star.i18n.BreakIterator");
606 : }
607 :
608 : sal_Bool SAL_CALL
609 0 : BreakIteratorImpl::supportsService(const OUString& rServiceName) throw( RuntimeException, std::exception )
610 : {
611 0 : return cppu::supportsService(this, rServiceName);
612 : }
613 :
614 : Sequence< OUString > SAL_CALL
615 1 : BreakIteratorImpl::getSupportedServiceNames() throw( RuntimeException, std::exception )
616 : {
617 1 : Sequence< OUString > aRet(1);
618 1 : aRet[0] = "com.sun.star.i18n.BreakIterator";
619 1 : return aRet;
620 : }
621 :
622 : } } } }
623 :
624 : extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface * SAL_CALL
625 26602 : com_sun_star_i18n_BreakIterator_get_implementation(
626 : css::uno::XComponentContext *context,
627 : css::uno::Sequence<css::uno::Any> const &)
628 : {
629 26602 : return cppu::acquire(new com::sun::star::i18n::BreakIteratorImpl(context));
630 : }
631 :
632 : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|