/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This file incorporates work covered by the following license notice:
 *
 *   Licensed to the Apache Software Foundation (ASF) under one or more
 *   contributor license agreements. See the NOTICE file distributed
 *   with this work for additional information regarding copyright
 *   ownership. The ASF licenses this file to you under the Apache
 *   License, Version 2.0 (the "License"); you may not use this file
 *   except in compliance with the License. You may obtain a copy of
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
 */
19 :
20 : #include <breakiteratorImpl.hxx>
21 : #include <cppuhelper/supportsservice.hxx>
22 : #include <unicode/uchar.h>
23 : #include <i18nutil/unicode.hxx>
24 : #include <rtl/ustrbuf.hxx>
25 :
26 : using namespace ::com::sun::star::uno;
27 : using namespace ::com::sun::star::lang;
28 : using namespace ::rtl;
29 :
30 : namespace com { namespace sun { namespace star { namespace i18n {
31 :
32 0 : BreakIteratorImpl::BreakIteratorImpl( const Reference < XComponentContext >& rxContext ) : m_xContext( rxContext )
33 : {
34 0 : }
35 :
36 0 : BreakIteratorImpl::BreakIteratorImpl()
37 : {
38 0 : }
39 :
40 0 : BreakIteratorImpl::~BreakIteratorImpl()
41 : {
42 : // Clear lookuptable
43 0 : for (size_t l = 0; l < lookupTable.size(); l++)
44 0 : delete lookupTable[l];
45 0 : lookupTable.clear();
46 0 : }
47 :
48 : #define LBI getLocaleSpecificBreakIterator(rLocale)
49 :
50 0 : sal_Int32 SAL_CALL BreakIteratorImpl::nextCharacters( const OUString& Text, sal_Int32 nStartPos,
51 : const Locale &rLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
52 : throw(RuntimeException, std::exception)
53 : {
54 0 : if (nCount < 0) throw RuntimeException();
55 :
56 0 : return LBI->nextCharacters( Text, nStartPos, rLocale, nCharacterIteratorMode, nCount, nDone);
57 : }
58 :
59 0 : sal_Int32 SAL_CALL BreakIteratorImpl::previousCharacters( const OUString& Text, sal_Int32 nStartPos,
60 : const Locale& rLocale, sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
61 : throw(RuntimeException, std::exception)
62 : {
63 0 : if (nCount < 0) throw RuntimeException();
64 :
65 0 : return LBI->previousCharacters( Text, nStartPos, rLocale, nCharacterIteratorMode, nCount, nDone);
66 : }
67 :
68 : #define isZWSP(c) (ch == 0x200B)
69 :
70 0 : static sal_Int32 skipSpace(const OUString& Text, sal_Int32 nPos, sal_Int32 len, sal_Int16 rWordType, sal_Bool bDirection)
71 : {
72 0 : sal_uInt32 ch=0;
73 0 : sal_Int32 pos=nPos;
74 0 : switch (rWordType) {
75 : case WordType::ANYWORD_IGNOREWHITESPACES:
76 0 : if (bDirection)
77 0 : while (nPos < len && (u_isWhitespace(ch = Text.iterateCodePoints(&pos, 1)) || isZWSP(ch))) nPos=pos;
78 : else
79 0 : while (nPos > 0 && (u_isWhitespace(ch = Text.iterateCodePoints(&pos, -1)) || isZWSP(ch))) nPos=pos;
80 0 : break;
81 : case WordType::DICTIONARY_WORD:
82 0 : if (bDirection)
83 0 : while (nPos < len && (u_isWhitespace(ch = Text.iterateCodePoints(&pos, 1)) || isZWSP(ch) ||
84 0 : ! (ch == 0x002E || u_isalnum(ch)))) nPos=pos;
85 : else
86 0 : while (nPos > 0 && (u_isWhitespace(ch = Text.iterateCodePoints(&pos, -1)) || isZWSP(ch) ||
87 0 : ! (ch == 0x002E || u_isalnum(ch)))) nPos=pos;
88 0 : break;
89 : case WordType::WORD_COUNT:
90 0 : if (bDirection)
91 0 : while (nPos < len && (u_isUWhiteSpace(ch = Text.iterateCodePoints(&pos, 1)) || isZWSP(ch))) nPos=pos;
92 : else
93 0 : while (nPos > 0 && (u_isUWhiteSpace(ch = Text.iterateCodePoints(&pos, -1)) || isZWSP(ch))) nPos=pos;
94 0 : break;
95 : }
96 0 : return nPos;
97 : }
98 :
99 0 : Boundary SAL_CALL BreakIteratorImpl::nextWord( const OUString& Text, sal_Int32 nStartPos,
100 : const Locale& rLocale, sal_Int16 rWordType ) throw(RuntimeException, std::exception)
101 : {
102 0 : sal_Int32 len = Text.getLength();
103 0 : if( nStartPos < 0 || len == 0 )
104 0 : result.endPos = result.startPos = 0;
105 0 : else if (nStartPos >= len)
106 0 : result.endPos = result.startPos = len;
107 : else {
108 0 : result = LBI->nextWord(Text, nStartPos, rLocale, rWordType);
109 :
110 0 : nStartPos = skipSpace(Text, result.startPos, len, rWordType, sal_True);
111 :
112 0 : if ( nStartPos != result.startPos) {
113 0 : if( nStartPos >= len )
114 0 : result.startPos = result.endPos = len;
115 : else {
116 0 : result = LBI->getWordBoundary(Text, nStartPos, rLocale, rWordType, sal_True);
117 : // i88041: avoid startPos goes back to nStartPos when switching between Latin and CJK scripts
118 0 : if (result.startPos < nStartPos) result.startPos = nStartPos;
119 : }
120 : }
121 : }
122 0 : return result;
123 : }
124 :
125 0 : static inline sal_Bool SAL_CALL isCJK( const Locale& rLocale ) {
126 0 : return rLocale.Language == "zh" || rLocale.Language == "ja" || rLocale.Language == "ko";
127 : }
128 :
129 0 : Boundary SAL_CALL BreakIteratorImpl::previousWord( const OUString& Text, sal_Int32 nStartPos,
130 : const Locale& rLocale, sal_Int16 rWordType) throw(RuntimeException, std::exception)
131 : {
132 0 : sal_Int32 len = Text.getLength();
133 0 : if( nStartPos <= 0 || len == 0 ) {
134 0 : result.endPos = result.startPos = 0;
135 0 : return result;
136 0 : } else if (nStartPos > len) {
137 0 : result.endPos = result.startPos = len;
138 0 : return result;
139 : }
140 :
141 0 : sal_Int32 nPos = skipSpace(Text, nStartPos, len, rWordType, sal_False);
142 :
143 : // if some spaces are skiped, and the script type is Asian with no CJK rLocale, we have to return
144 : // (nStartPos, -1) for caller to send correct rLocale for loading correct dictionary.
145 0 : result.startPos = nPos;
146 0 : if (nPos != nStartPos && nPos > 0 && !isCJK(rLocale) && getScriptClass(Text.iterateCodePoints(&nPos, -1)) == ScriptType::ASIAN) {
147 0 : result.endPos = -1;
148 0 : return result;
149 : }
150 :
151 0 : return LBI->previousWord(Text, result.startPos, rLocale, rWordType);
152 : }
153 :
154 :
155 0 : Boundary SAL_CALL BreakIteratorImpl::getWordBoundary( const OUString& Text, sal_Int32 nPos, const Locale& rLocale,
156 : sal_Int16 rWordType, sal_Bool bDirection ) throw(RuntimeException, std::exception)
157 : {
158 0 : sal_Int32 len = Text.getLength();
159 0 : if( nPos < 0 || len == 0 )
160 0 : result.endPos = result.startPos = 0;
161 0 : else if (nPos > len)
162 0 : result.endPos = result.startPos = len;
163 : else {
164 : sal_Int32 next, prev;
165 0 : next = skipSpace(Text, nPos, len, rWordType, sal_True);
166 0 : prev = skipSpace(Text, nPos, len, rWordType, sal_False);
167 0 : if (prev == 0 && next == len) {
168 0 : result.endPos = result.startPos = nPos;
169 0 : } else if (prev == 0 && ! bDirection) {
170 0 : result.endPos = result.startPos = 0;
171 0 : } else if (next == len && bDirection) {
172 0 : result.endPos = result.startPos = len;
173 : } else {
174 0 : if (next != prev) {
175 0 : if (next == nPos && next != len)
176 0 : bDirection = sal_True;
177 0 : else if (prev == nPos && prev != 0)
178 0 : bDirection = sal_False;
179 : else
180 0 : nPos = bDirection ? next : prev;
181 : }
182 0 : result = LBI->getWordBoundary(Text, nPos, rLocale, rWordType, bDirection);
183 : }
184 : }
185 0 : return result;
186 : }
187 :
188 0 : sal_Bool SAL_CALL BreakIteratorImpl::isBeginWord( const OUString& Text, sal_Int32 nPos,
189 : const Locale& rLocale, sal_Int16 rWordType ) throw(RuntimeException, std::exception)
190 : {
191 0 : sal_Int32 len = Text.getLength();
192 :
193 0 : if (nPos < 0 || nPos >= len) return sal_False;
194 :
195 0 : sal_Int32 tmp = skipSpace(Text, nPos, len, rWordType, sal_True);
196 :
197 0 : if (tmp != nPos) return sal_False;
198 :
199 0 : result = getWordBoundary(Text, nPos, rLocale, rWordType, sal_True);
200 :
201 0 : return result.startPos == nPos;
202 : }
203 :
204 0 : sal_Bool SAL_CALL BreakIteratorImpl::isEndWord( const OUString& Text, sal_Int32 nPos,
205 : const Locale& rLocale, sal_Int16 rWordType ) throw(RuntimeException, std::exception)
206 : {
207 0 : sal_Int32 len = Text.getLength();
208 :
209 0 : if (nPos <= 0 || nPos > len) return sal_False;
210 :
211 0 : sal_Int32 tmp = skipSpace(Text, nPos, len, rWordType, sal_False);
212 :
213 0 : if (tmp != nPos) return sal_False;
214 :
215 0 : result = getWordBoundary(Text, nPos, rLocale, rWordType, sal_False);
216 :
217 0 : return result.endPos == nPos;
218 : }
219 :
220 0 : sal_Int32 SAL_CALL BreakIteratorImpl::beginOfSentence( const OUString& Text, sal_Int32 nStartPos,
221 : const Locale &rLocale ) throw(RuntimeException, std::exception)
222 : {
223 0 : if (nStartPos < 0 || nStartPos > Text.getLength())
224 0 : return -1;
225 0 : if (Text.isEmpty()) return 0;
226 0 : return LBI->beginOfSentence(Text, nStartPos, rLocale);
227 : }
228 :
229 0 : sal_Int32 SAL_CALL BreakIteratorImpl::endOfSentence( const OUString& Text, sal_Int32 nStartPos,
230 : const Locale &rLocale ) throw(RuntimeException, std::exception)
231 : {
232 0 : if (nStartPos < 0 || nStartPos > Text.getLength())
233 0 : return -1;
234 0 : if (Text.isEmpty()) return 0;
235 0 : return LBI->endOfSentence(Text, nStartPos, rLocale);
236 : }
237 :
238 0 : LineBreakResults SAL_CALL BreakIteratorImpl::getLineBreak( const OUString& Text, sal_Int32 nStartPos,
239 : const Locale& rLocale, sal_Int32 nMinBreakPos, const LineBreakHyphenationOptions& hOptions,
240 : const LineBreakUserOptions& bOptions ) throw(RuntimeException, std::exception)
241 : {
242 0 : return LBI->getLineBreak(Text, nStartPos, rLocale, nMinBreakPos, hOptions, bOptions);
243 : }
244 :
245 0 : sal_Int16 SAL_CALL BreakIteratorImpl::getScriptType( const OUString& Text, sal_Int32 nPos )
246 : throw(RuntimeException, std::exception)
247 : {
248 0 : return (nPos < 0 || nPos >= Text.getLength()) ? ScriptType::WEAK :
249 0 : getScriptClass(Text.iterateCodePoints(&nPos, 0));
250 : }
251 :
252 :
253 : /** Increments/decrements position first, then obtains character.
254 : @return current position, may be -1 or text length if string was consumed.
255 : */
256 0 : static sal_Int32 SAL_CALL iterateCodePoints(const OUString& Text, sal_Int32 &nStartPos, sal_Int32 inc, sal_uInt32& ch) {
257 0 : sal_Int32 nLen = Text.getLength();
258 0 : if (nStartPos + inc < 0 || nStartPos + inc >= nLen) {
259 0 : ch = 0;
260 0 : nStartPos = nStartPos + inc < 0 ? -1 : nLen;
261 : } else {
262 0 : ch = Text.iterateCodePoints(&nStartPos, inc);
263 : // Fix for #i80436#.
264 : // erAck: 2009-06-30T21:52+0200 This logic looks somewhat
265 : // suspicious as if it cures a symptom.. anyway, had to add
266 : // nStartPos < Text.getLength() to silence the (correct) assertion
267 : // in rtl_uString_iterateCodePoints() if Text was one character
268 : // (codepoint) only, made up of a surrogate pair.
269 : //if (inc > 0 && nStartPos < Text.getLength())
270 : // ch = Text.iterateCodePoints(&nStartPos, 0);
271 : // With surrogates, nStartPos may actually point behind string
272 : // now, even if inc is only +1
273 0 : if (inc > 0)
274 0 : ch = (nStartPos < nLen ? Text.iterateCodePoints(&nStartPos, 0) : 0);
275 : }
276 0 : return nStartPos;
277 : }
278 :
279 :
280 0 : sal_Int32 SAL_CALL BreakIteratorImpl::beginOfScript( const OUString& Text,
281 : sal_Int32 nStartPos, sal_Int16 ScriptType ) throw(RuntimeException, std::exception)
282 : {
283 0 : if (nStartPos < 0 || nStartPos >= Text.getLength())
284 0 : return -1;
285 :
286 0 : if(ScriptType != getScriptClass(Text.iterateCodePoints(&nStartPos, 0)))
287 0 : return -1;
288 :
289 0 : if (nStartPos == 0) return 0;
290 0 : sal_uInt32 ch=0;
291 0 : while (iterateCodePoints(Text, nStartPos, -1, ch) >= 0 && ScriptType == getScriptClass(ch)) {
292 0 : if (nStartPos == 0) return 0;
293 : }
294 :
295 0 : return iterateCodePoints(Text, nStartPos, 1, ch);
296 : }
297 :
298 0 : sal_Int32 SAL_CALL BreakIteratorImpl::endOfScript( const OUString& Text,
299 : sal_Int32 nStartPos, sal_Int16 ScriptType ) throw(RuntimeException, std::exception)
300 : {
301 0 : if (nStartPos < 0 || nStartPos >= Text.getLength())
302 0 : return -1;
303 :
304 0 : if(ScriptType != getScriptClass(Text.iterateCodePoints(&nStartPos, 0)))
305 0 : return -1;
306 :
307 0 : sal_Int32 strLen = Text.getLength();
308 0 : sal_uInt32 ch=0;
309 0 : while(iterateCodePoints(Text, nStartPos, 1, ch) < strLen ) {
310 0 : sal_Int16 currentCharScriptType = getScriptClass(ch);
311 0 : if(ScriptType != currentCharScriptType && currentCharScriptType != ScriptType::WEAK)
312 0 : break;
313 : }
314 0 : return nStartPos;
315 : }
316 :
317 0 : sal_Int32 SAL_CALL BreakIteratorImpl::previousScript( const OUString& Text,
318 : sal_Int32 nStartPos, sal_Int16 ScriptType ) throw(RuntimeException, std::exception)
319 : {
320 0 : if (nStartPos < 0)
321 0 : return -1;
322 0 : if (nStartPos > Text.getLength())
323 0 : nStartPos = Text.getLength();
324 :
325 0 : sal_Int16 numberOfChange = (ScriptType == getScriptClass(Text.iterateCodePoints(&nStartPos, 0))) ? 3 : 2;
326 :
327 0 : sal_uInt32 ch=0;
328 0 : while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, -1, ch) >= 0) {
329 0 : if ((((numberOfChange % 2) == 0) != (ScriptType != getScriptClass(ch))))
330 0 : numberOfChange--;
331 0 : else if (nStartPos == 0) {
332 0 : return -1;
333 : }
334 : }
335 0 : return numberOfChange == 0 ? iterateCodePoints(Text, nStartPos, 1, ch) : -1;
336 : }
337 :
338 0 : sal_Int32 SAL_CALL BreakIteratorImpl::nextScript( const OUString& Text, sal_Int32 nStartPos,
339 : sal_Int16 ScriptType ) throw(RuntimeException, std::exception)
340 :
341 : {
342 0 : if (nStartPos < 0)
343 0 : nStartPos = 0;
344 0 : sal_Int32 strLen = Text.getLength();
345 0 : if (nStartPos >= strLen)
346 0 : return -1;
347 :
348 0 : sal_Int16 numberOfChange = (ScriptType == getScriptClass(Text.iterateCodePoints(&nStartPos, 0))) ? 2 : 1;
349 :
350 0 : sal_uInt32 ch=0;
351 0 : while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, 1, ch) < strLen) {
352 0 : sal_Int16 currentCharScriptType = getScriptClass(ch);
353 0 : if ((numberOfChange == 1) ? (ScriptType == currentCharScriptType) :
354 0 : (ScriptType != currentCharScriptType && currentCharScriptType != ScriptType::WEAK))
355 0 : numberOfChange--;
356 : }
357 0 : return numberOfChange == 0 ? nStartPos : -1;
358 : }
359 :
360 0 : sal_Int32 SAL_CALL BreakIteratorImpl::beginOfCharBlock( const OUString& Text, sal_Int32 nStartPos,
361 : const Locale& /*rLocale*/, sal_Int16 CharType ) throw(RuntimeException, std::exception)
362 : {
363 0 : if (CharType == CharType::ANY_CHAR) return 0;
364 0 : if (nStartPos < 0 || nStartPos >= Text.getLength()) return -1;
365 0 : if (CharType != (sal_Int16)u_charType( Text.iterateCodePoints(&nStartPos, 0))) return -1;
366 :
367 0 : sal_Int32 nPos=nStartPos;
368 0 : while(nStartPos > 0 && CharType == (sal_Int16)u_charType(Text.iterateCodePoints(&nPos, -1))) { nStartPos=nPos; }
369 0 : return nStartPos; // begin of char block is inclusive
370 : }
371 :
372 0 : sal_Int32 SAL_CALL BreakIteratorImpl::endOfCharBlock( const OUString& Text, sal_Int32 nStartPos,
373 : const Locale& /*rLocale*/, sal_Int16 CharType ) throw(RuntimeException, std::exception)
374 : {
375 0 : sal_Int32 strLen = Text.getLength();
376 :
377 0 : if (CharType == CharType::ANY_CHAR) return strLen; // end of char block is exclusive
378 0 : if (nStartPos < 0 || nStartPos >= strLen) return -1;
379 0 : if (CharType != (sal_Int16)u_charType(Text.iterateCodePoints(&nStartPos, 0))) return -1;
380 :
381 0 : sal_uInt32 ch=0;
382 0 : while(iterateCodePoints(Text, nStartPos, 1, ch) < strLen && CharType == (sal_Int16)u_charType(ch)) {}
383 0 : return nStartPos; // end of char block is exclusive
384 : }
385 :
386 0 : sal_Int32 SAL_CALL BreakIteratorImpl::nextCharBlock( const OUString& Text, sal_Int32 nStartPos,
387 : const Locale& /*rLocale*/, sal_Int16 CharType ) throw(RuntimeException, std::exception)
388 : {
389 0 : if (CharType == CharType::ANY_CHAR) return -1;
390 0 : if (nStartPos < 0 || nStartPos >= Text.getLength()) return -1;
391 :
392 0 : sal_Int16 numberOfChange = (CharType == (sal_Int16)u_charType(Text.iterateCodePoints(&nStartPos, 0))) ? 2 : 1;
393 0 : sal_Int32 strLen = Text.getLength();
394 :
395 0 : sal_uInt32 ch=0;
396 0 : while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, 1, ch) < strLen) {
397 0 : if ((CharType != (sal_Int16)u_charType(ch)) != (numberOfChange == 1))
398 0 : numberOfChange--;
399 : }
400 0 : return numberOfChange == 0 ? nStartPos : -1;
401 : }
402 :
403 0 : sal_Int32 SAL_CALL BreakIteratorImpl::previousCharBlock( const OUString& Text, sal_Int32 nStartPos,
404 : const Locale& /*rLocale*/, sal_Int16 CharType ) throw(RuntimeException, std::exception)
405 : {
406 0 : if(CharType == CharType::ANY_CHAR) return -1;
407 0 : if (nStartPos < 0 || nStartPos >= Text.getLength()) return -1;
408 :
409 0 : sal_Int16 numberOfChange = (CharType == (sal_Int16)u_charType(Text.iterateCodePoints(&nStartPos, 0))) ? 3 : 2;
410 :
411 0 : sal_uInt32 ch=0;
412 0 : while (numberOfChange > 0 && iterateCodePoints(Text, nStartPos, -1, ch) >= 0) {
413 0 : if (((numberOfChange % 2) == 0) != (CharType != (sal_Int16)u_charType(ch)))
414 0 : numberOfChange--;
415 0 : if (nStartPos == 0 && numberOfChange > 0) {
416 0 : numberOfChange--;
417 0 : if (numberOfChange == 0) return nStartPos;
418 : }
419 : }
420 0 : return numberOfChange == 0 ? iterateCodePoints(Text, nStartPos, 1, ch) : -1;
421 : }
422 :
423 :
424 :
425 0 : sal_Int16 SAL_CALL BreakIteratorImpl::getWordType( const OUString& /*Text*/,
426 : sal_Int32 /*nPos*/, const Locale& /*rLocale*/ ) throw(RuntimeException, std::exception)
427 : {
428 0 : return 0;
429 : }
430 :
431 : namespace
432 : {
433 0 : sal_Int16 getScriptClassByUAX24Script(sal_uInt32 currentChar)
434 : {
435 0 : int32_t script = u_getIntPropertyValue(currentChar, UCHAR_SCRIPT);
436 0 : return unicode::getScriptClassFromUScriptCode(static_cast<UScriptCode>(script));
437 : }
438 :
439 : struct UBlock2Script
440 : {
441 : UBlockCode from;
442 : UBlockCode to;
443 : sal_Int16 script;
444 : };
445 :
446 : static const UBlock2Script scriptList[] =
447 : {
448 : {UBLOCK_NO_BLOCK, UBLOCK_NO_BLOCK, ScriptType::WEAK},
449 : {UBLOCK_BASIC_LATIN, UBLOCK_ARMENIAN, ScriptType::LATIN},
450 : {UBLOCK_HEBREW, UBLOCK_MYANMAR, ScriptType::COMPLEX},
451 : {UBLOCK_GEORGIAN, UBLOCK_GEORGIAN, ScriptType::LATIN},
452 : {UBLOCK_HANGUL_JAMO, UBLOCK_HANGUL_JAMO, ScriptType::ASIAN},
453 : {UBLOCK_ETHIOPIC, UBLOCK_ETHIOPIC, ScriptType::COMPLEX},
454 : {UBLOCK_CHEROKEE, UBLOCK_RUNIC, ScriptType::LATIN},
455 : {UBLOCK_KHMER, UBLOCK_MONGOLIAN, ScriptType::COMPLEX},
456 : {UBLOCK_LATIN_EXTENDED_ADDITIONAL, UBLOCK_GREEK_EXTENDED, ScriptType::LATIN},
457 : {UBLOCK_NUMBER_FORMS, UBLOCK_NUMBER_FORMS, ScriptType::WEAK},
458 : {UBLOCK_CJK_RADICALS_SUPPLEMENT, UBLOCK_HANGUL_SYLLABLES, ScriptType::ASIAN},
459 : {UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS, ScriptType::ASIAN},
460 : {UBLOCK_ARABIC_PRESENTATION_FORMS_A, UBLOCK_ARABIC_PRESENTATION_FORMS_A, ScriptType::COMPLEX},
461 : {UBLOCK_CJK_COMPATIBILITY_FORMS, UBLOCK_CJK_COMPATIBILITY_FORMS, ScriptType::ASIAN},
462 : {UBLOCK_ARABIC_PRESENTATION_FORMS_B, UBLOCK_ARABIC_PRESENTATION_FORMS_B, ScriptType::COMPLEX},
463 : {UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, UBLOCK_HALFWIDTH_AND_FULLWIDTH_FORMS, ScriptType::ASIAN},
464 : {UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B, UBLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT, ScriptType::ASIAN},
465 : {UBLOCK_CJK_STROKES, UBLOCK_CJK_STROKES, ScriptType::ASIAN},
466 : {UBLOCK_LATIN_EXTENDED_C, UBLOCK_LATIN_EXTENDED_D, ScriptType::LATIN}
467 : };
468 :
469 : #define scriptListCount SAL_N_ELEMENTS(scriptList)
470 :
471 : //always sets rScriptType
472 :
473 : //returns true for characters historically explicitly assigned to
474 : //latin/weak/asian
475 :
476 : //returns false for characters that historically implicitly assigned to
477 : //weak as unknown
478 0 : bool getCompatibilityScriptClassByBlock(sal_uInt32 currentChar, sal_Int16 &rScriptType)
479 : {
480 0 : bool bKnown = true;
481 : //handle specific characters always as weak:
482 : // 0x01 - this breaks a word
483 : // 0x02 - this can be inside a word
484 : // 0x20 & 0xA0 - Bug 102975, declare western space and non-break space as WEAK char.
485 0 : if( 0x01 == currentChar || 0x02 == currentChar || 0x20 == currentChar || 0xA0 == currentChar)
486 0 : rScriptType = ScriptType::WEAK;
487 : // workaround for Coptic
488 0 : else if ( 0x2C80 <= currentChar && 0x2CE3 >= currentChar)
489 0 : rScriptType = ScriptType::LATIN;
490 : else
491 : {
492 0 : UBlockCode block=ublock_getCode(currentChar);
493 0 : size_t i = 0;
494 0 : while (i < scriptListCount)
495 : {
496 0 : if (block <= scriptList[i].to)
497 0 : break;
498 0 : ++i;
499 : }
500 0 : if (i < scriptListCount && block >= scriptList[i].from)
501 0 : rScriptType = scriptList[i].script;
502 : else
503 : {
504 0 : rScriptType = ScriptType::WEAK;
505 0 : bKnown = false;
506 : }
507 : }
508 0 : return bKnown;
509 : }
510 : }
511 :
512 0 : sal_Int16 BreakIteratorImpl::getScriptClass(sal_uInt32 currentChar)
513 : {
514 : static sal_uInt32 lastChar = 0;
515 : static sal_Int16 nRet = 0;
516 :
517 0 : if (currentChar != lastChar)
518 : {
519 0 : lastChar = currentChar;
520 :
521 0 : if (!getCompatibilityScriptClassByBlock(currentChar, nRet))
522 0 : nRet = getScriptClassByUAX24Script(currentChar);
523 : }
524 :
525 0 : return nRet;
526 : }
527 :
528 0 : static inline sal_Bool operator == (const Locale& l1, const Locale& l2) {
529 0 : return l1.Language == l2.Language && l1.Country == l2.Country && l1.Variant == l2.Variant;
530 : }
531 :
532 0 : sal_Bool SAL_CALL BreakIteratorImpl::createLocaleSpecificBreakIterator(const OUString& aLocaleName) throw( RuntimeException )
533 : {
534 : // to share service between same Language but different Country code, like zh_CN and zh_TW
535 0 : for (size_t l = 0; l < lookupTable.size(); l++) {
536 0 : lookupTableItem *listItem = lookupTable[l];
537 0 : if (aLocaleName == listItem->aLocale.Language) {
538 0 : xBI = listItem->xBI;
539 0 : return sal_True;
540 : }
541 : }
542 :
543 0 : Reference < uno::XInterface > xI = m_xContext->getServiceManager()->createInstanceWithContext(
544 0 : OUString("com.sun.star.i18n.BreakIterator_") + aLocaleName, m_xContext);
545 :
546 0 : if ( xI.is() ) {
547 0 : xBI.set(xI, UNO_QUERY);
548 0 : if (xBI.is()) {
549 0 : lookupTable.push_back(new lookupTableItem(Locale(aLocaleName, aLocaleName, aLocaleName), xBI));
550 0 : return sal_True;
551 : }
552 : }
553 0 : return sal_False;
554 : }
555 :
556 : Reference < XBreakIterator > SAL_CALL
557 0 : BreakIteratorImpl::getLocaleSpecificBreakIterator(const Locale& rLocale) throw (RuntimeException)
558 : {
559 0 : if (xBI.is() && rLocale == aLocale)
560 0 : return xBI;
561 0 : else if (m_xContext.is()) {
562 0 : aLocale = rLocale;
563 :
564 0 : for (size_t i = 0; i < lookupTable.size(); i++) {
565 0 : lookupTableItem *listItem = lookupTable[i];
566 0 : if (rLocale == listItem->aLocale)
567 0 : return xBI = listItem->xBI;
568 : }
569 :
570 0 : sal_Unicode under = (sal_Unicode)'_';
571 :
572 0 : sal_Int32 l = rLocale.Language.getLength();
573 0 : sal_Int32 c = rLocale.Country.getLength();
574 0 : sal_Int32 v = rLocale.Variant.getLength();
575 0 : OUStringBuffer aBuf(l+c+v+3);
576 :
577 0 : if ((l > 0 && c > 0 && v > 0 &&
578 : // load service with name <base>_<lang>_<country>_<varian>
579 0 : createLocaleSpecificBreakIterator(aBuf.append(rLocale.Language).append(under).append(
580 0 : rLocale.Country).append(under).append(rLocale.Variant).makeStringAndClear())) ||
581 0 : (l > 0 && c > 0 &&
582 : // load service with name <base>_<lang>_<country>
583 0 : createLocaleSpecificBreakIterator(aBuf.append(rLocale.Language).append(under).append(
584 0 : rLocale.Country).makeStringAndClear())) ||
585 0 : (l > 0 && c > 0 && rLocale.Language.equalsAscii("zh") &&
586 0 : (rLocale.Country.equalsAscii("HK") ||
587 0 : rLocale.Country.equalsAscii("MO") ) &&
588 : // if the country code is HK or MO, one more step to try TW.
589 0 : createLocaleSpecificBreakIterator(aBuf.append(rLocale.Language).append(under).appendAscii(
590 0 : "TW").makeStringAndClear())) ||
591 0 : (l > 0 &&
592 : // load service with name <base>_<lang>
593 0 : createLocaleSpecificBreakIterator(rLocale.Language)) ||
594 : // load default service with name <base>_Unicode
595 0 : createLocaleSpecificBreakIterator(OUString("Unicode"))) {
596 0 : lookupTable.push_back( new lookupTableItem(aLocale, xBI) );
597 0 : return xBI;
598 0 : }
599 : }
600 0 : throw RuntimeException();
601 : }
602 :
603 : OUString SAL_CALL
604 0 : BreakIteratorImpl::getImplementationName(void) throw( RuntimeException, std::exception )
605 : {
606 0 : return OUString("com.sun.star.i18n.BreakIterator");
607 : }
608 :
609 : sal_Bool SAL_CALL
610 0 : BreakIteratorImpl::supportsService(const OUString& rServiceName) throw( RuntimeException, std::exception )
611 : {
612 0 : return cppu::supportsService(this, rServiceName);
613 : }
614 :
615 : Sequence< OUString > SAL_CALL
616 0 : BreakIteratorImpl::getSupportedServiceNames(void) throw( RuntimeException, std::exception )
617 : {
618 0 : Sequence< OUString > aRet(1);
619 0 : aRet[0] = OUString("com.sun.star.i18n.BreakIterator");
620 0 : return aRet;
621 : }
622 :
623 : } } } }
624 :
625 : extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface * SAL_CALL
626 0 : com_sun_star_i18n_BreakIterator_get_implementation(
627 : css::uno::XComponentContext *context,
628 : css::uno::Sequence<css::uno::Any> const &)
629 : {
630 0 : return cppu::acquire(new com::sun::star::i18n::BreakIteratorImpl(context));
631 : }
632 :
633 : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|