Line data Source code
1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /*
3 : * This file is part of the LibreOffice project.
4 : *
5 : * This Source Code Form is subject to the terms of the Mozilla Public
6 : * License, v. 2.0. If a copy of the MPL was not distributed with this
7 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 : *
9 : * This file incorporates work covered by the following license notice:
10 : *
11 : * Licensed to the Apache Software Foundation (ASF) under one or more
12 : * contributor license agreements. See the NOTICE file distributed
13 : * with this work for additional information regarding copyright
14 : * ownership. The ASF licenses this file to you under the Apache
15 : * License, Version 2.0 (the "License"); you may not use this file
16 : * except in compliance with the License. You may obtain a copy of
17 : * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 : */
19 :
20 : #include <breakiterator_unicode.hxx>
21 : #include <localedata.hxx>
22 : #include <unicode/uchar.h>
23 : #include <unicode/locid.h>
24 : #include <unicode/rbbi.h>
25 : #include <unicode/udata.h>
26 : #include <rtl/strbuf.hxx>
27 : #include <rtl/ustring.hxx>
28 : #include <string.h>
29 :
30 : U_CDECL_BEGIN
31 : extern const char OpenOffice_dat[];
32 : U_CDECL_END
33 :
34 : using namespace ::com::sun::star;
35 : using namespace ::com::sun::star::lang;
36 : using namespace ::rtl;
37 :
38 : namespace com { namespace sun { namespace star { namespace i18n {
39 :
// Shorthand for a default-constructed UNO RuntimeException; used in this file as `throw ERROR;`.
40 : #define ERROR ::com::sun::star::uno::RuntimeException()
41 :
42 :
43 189 : BreakIterator_Unicode::BreakIterator_Unicode() :
44 : cBreakIterator( "com.sun.star.i18n.BreakIterator_Unicode" ), // implementation name
45 : wordRule( "word" ),
46 : lineRule( "line" ),
47 : result(),
48 : character(),
49 : word(),
50 : sentence(),
51 : line(),
52 : icuBI( NULL ),
53 : aLocale(),
54 : aBreakType(),
55 189 : aWordType()
56 : {
57 189 : }
58 :
59 :
60 409 : BreakIterator_Unicode::~BreakIterator_Unicode()
61 : {
62 139 : if (icuBI && icuBI->aBreakIterator) {
63 133 : delete icuBI->aBreakIterator;
64 133 : icuBI->aBreakIterator=NULL;
65 : }
66 139 : if (character.aBreakIterator) delete character.aBreakIterator;
67 139 : if (word.aBreakIterator) delete word.aBreakIterator;
68 139 : if (sentence.aBreakIterator) delete sentence.aBreakIterator;
69 139 : if (line.aBreakIterator) delete line.aBreakIterator;
70 270 : }
71 :
72 : /*
73 : Wrapper class to provide public access to the RuleBasedBreakIterator's
74 : setbreakType method.
75 : */
76 29108 : class OOoRuleBasedBreakIterator : public RuleBasedBreakIterator {
77 : public:
78 14606 : inline void publicSetBreakType(int32_t type) {
79 14606 : setBreakType(type);
80 14606 : };
81 14606 : OOoRuleBasedBreakIterator(UDataMemory* image,
82 : UErrorCode &status) :
83 14606 : RuleBasedBreakIterator(image, status) { };
84 :
85 : };
86 :
87 : // loading ICU breakiterator on demand.
88 42421 : void SAL_CALL BreakIterator_Unicode::loadICUBreakIterator(const com::sun::star::lang::Locale& rLocale,
89 : sal_Int16 rBreakType, sal_Int16 rWordType, const sal_Char *rule, const OUString& rText) throw(uno::RuntimeException)
90 : {
91 42421 : sal_Bool newBreak = sal_False;
92 42421 : UErrorCode status = U_ZERO_ERROR;
93 42421 : sal_Int16 breakType = 0;
94 42421 : switch (rBreakType) {
95 21746 : case LOAD_CHARACTER_BREAKITERATOR: icuBI=&character; breakType = 3; break;
96 11891 : case LOAD_WORD_BREAKITERATOR: icuBI=&word;
97 11891 : switch (rWordType) {
98 46 : case WordType::ANYWORD_IGNOREWHITESPACES: breakType = 0; rule=wordRule = "edit_word"; break;
99 10279 : case WordType::DICTIONARY_WORD: breakType = 1; rule=wordRule = "dict_word"; break;
100 1566 : case WordType::WORD_COUNT: breakType = 2; rule=wordRule = "count_word"; break;
101 : }
102 11891 : break;
103 26 : case LOAD_SENTENCE_BREAKITERATOR: icuBI=&sentence; breakType = 5; break;
104 8758 : case LOAD_LINE_BREAKITERATOR: icuBI=&line; breakType = 4; break;
105 : }
106 125907 : if (!icuBI->aBreakIterator || rWordType != aWordType ||
107 55675 : rLocale.Language != aLocale.Language || rLocale.Country != aLocale.Country ||
108 27811 : rLocale.Variant != aLocale.Variant) {
109 14610 : if (icuBI->aBreakIterator) {
110 14314 : delete icuBI->aBreakIterator;
111 14314 : icuBI->aBreakIterator=NULL;
112 : }
113 14610 : if (rule) {
114 14610 : uno::Sequence< OUString > breakRules = LocaleData().getBreakIteratorRules(rLocale);
115 :
116 14610 : status = U_ZERO_ERROR;
117 14610 : udata_setAppData("OpenOffice", OpenOffice_dat, &status);
118 14610 : if ( !U_SUCCESS(status) ) throw ERROR;
119 :
120 14610 : OOoRuleBasedBreakIterator *rbi = NULL;
121 :
122 14610 : if (breakRules.getLength() > breakType && !breakRules[breakType].isEmpty())
123 : {
124 : rbi = new OOoRuleBasedBreakIterator(udata_open("OpenOffice", "brk",
125 12 : OUStringToOString(breakRules[breakType], RTL_TEXTENCODING_ASCII_US).getStr(), &status), status);
126 : }
127 14598 : else if (rLocale.Language != "th" && rLocale.Language != "km") //use icu's breakiterator for Thai and Khmer
128 : {
129 14594 : status = U_ZERO_ERROR;
130 14594 : OStringBuffer aUDName(64);
131 14594 : aUDName.append(rule);
132 14594 : aUDName.append('_');
133 14594 : aUDName.append( OUStringToOString(rLocale.Language, RTL_TEXTENCODING_ASCII_US));
134 14594 : UDataMemory* pUData = udata_open("OpenOffice", "brk", aUDName.getStr(), &status);
135 14594 : if( U_SUCCESS(status) )
136 26 : rbi = new OOoRuleBasedBreakIterator( pUData, status);
137 14594 : if (!U_SUCCESS(status) ) {
138 14568 : status = U_ZERO_ERROR;
139 14568 : pUData = udata_open("OpenOffice", "brk", rule, &status);
140 14568 : if( U_SUCCESS(status) )
141 14568 : rbi = new OOoRuleBasedBreakIterator( pUData, status);
142 14568 : if (!U_SUCCESS(status) ) icuBI->aBreakIterator=NULL;
143 14594 : }
144 : }
145 14610 : if (rbi) {
146 14606 : switch (rBreakType) {
147 1576 : case LOAD_CHARACTER_BREAKITERATOR: rbi->publicSetBreakType(UBRK_CHARACTER); break;
148 7359 : case LOAD_WORD_BREAKITERATOR: rbi->publicSetBreakType(UBRK_WORD); break;
149 1 : case LOAD_SENTENCE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_SENTENCE); break;
150 5670 : case LOAD_LINE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_LINE); break;
151 : }
152 14606 : icuBI->aBreakIterator = rbi;
153 14610 : }
154 : }
155 :
156 14610 : if (!icuBI->aBreakIterator) {
157 : icu::Locale icuLocale(
158 : OUStringToOString(rLocale.Language, RTL_TEXTENCODING_ASCII_US).getStr(),
159 : OUStringToOString(rLocale.Country, RTL_TEXTENCODING_ASCII_US).getStr(),
160 4 : OUStringToOString(rLocale.Variant, RTL_TEXTENCODING_ASCII_US).getStr());
161 :
162 4 : status = U_ZERO_ERROR;
163 4 : switch (rBreakType) {
164 : case LOAD_CHARACTER_BREAKITERATOR:
165 0 : icuBI->aBreakIterator = icu::BreakIterator::createCharacterInstance(icuLocale, status);
166 0 : break;
167 : case LOAD_WORD_BREAKITERATOR:
168 4 : icuBI->aBreakIterator = icu::BreakIterator::createWordInstance(icuLocale, status);
169 4 : break;
170 : case LOAD_SENTENCE_BREAKITERATOR:
171 0 : icuBI->aBreakIterator = icu::BreakIterator::createSentenceInstance(icuLocale, status);
172 0 : break;
173 : case LOAD_LINE_BREAKITERATOR:
174 0 : icuBI->aBreakIterator = icu::BreakIterator::createLineInstance(icuLocale, status);
175 0 : break;
176 : }
177 4 : if ( !U_SUCCESS(status) ) {
178 0 : icuBI->aBreakIterator=NULL;
179 0 : throw ERROR;
180 4 : }
181 : }
182 14610 : if (icuBI->aBreakIterator) {
183 14610 : aLocale=rLocale;
184 14610 : aWordType=rWordType;
185 14610 : aBreakType=rBreakType;
186 14610 : newBreak=sal_True;
187 : } else {
188 0 : throw ERROR;
189 : }
190 : }
191 :
192 42421 : if (newBreak || !icuBI->aICUText.equals(rText))
193 : {
194 : // UChar != sal_Unicode in MinGW
195 16153 : const UChar *pText = reinterpret_cast<const UChar *>(rText.getStr());
196 :
197 16153 : icuBI->ut = utext_openUChars(icuBI->ut, pText, rText.getLength(), &status);
198 :
199 16153 : if (!U_SUCCESS(status))
200 0 : throw ERROR;
201 :
202 16153 : icuBI->aBreakIterator->setText(icuBI->ut, status);
203 :
204 16153 : if (!U_SUCCESS(status))
205 0 : throw ERROR;
206 :
207 16153 : icuBI->aICUText = rText;
208 : }
209 42421 : }
210 :
211 22498 : sal_Int32 SAL_CALL BreakIterator_Unicode::nextCharacters( const OUString& Text,
212 : sal_Int32 nStartPos, const lang::Locale &rLocale,
213 : sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
214 : throw(uno::RuntimeException)
215 : {
216 22498 : if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
217 21726 : loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
218 43452 : for (nDone = 0; nDone < nCount; nDone++) {
219 21726 : nStartPos = character.aBreakIterator->following(nStartPos);
220 21726 : if (nStartPos == BreakIterator::DONE)
221 0 : return Text.getLength();
222 : }
223 : } else { // for CHARACTER mode
224 1544 : for (nDone = 0; nDone < nCount && nStartPos < Text.getLength(); nDone++)
225 772 : Text.iterateCodePoints(&nStartPos, 1);
226 : }
227 22498 : return nStartPos;
228 : }
229 :
230 82 : sal_Int32 SAL_CALL BreakIterator_Unicode::previousCharacters( const OUString& Text,
231 : sal_Int32 nStartPos, const lang::Locale& rLocale,
232 : sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
233 : throw(uno::RuntimeException)
234 : {
235 82 : if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
236 20 : loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
237 40 : for (nDone = 0; nDone < nCount; nDone++) {
238 20 : nStartPos = character.aBreakIterator->preceding(nStartPos);
239 20 : if (nStartPos == BreakIterator::DONE)
240 0 : return 0;
241 : }
242 : } else { // for BS to delete one char and CHARACTER mode.
243 124 : for (nDone = 0; nDone < nCount && nStartPos > 0; nDone++)
244 62 : Text.iterateCodePoints(&nStartPos, -1);
245 : }
246 82 : return nStartPos;
247 : }
248 :
249 :
250 48 : Boundary SAL_CALL BreakIterator_Unicode::nextWord( const OUString& Text, sal_Int32 nStartPos,
251 : const lang::Locale& rLocale, sal_Int16 rWordType ) throw(uno::RuntimeException)
252 : {
253 48 : loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
254 :
255 48 : result.startPos = word.aBreakIterator->following(nStartPos);
256 48 : if( result.startPos >= Text.getLength() || result.startPos == BreakIterator::DONE )
257 5 : result.endPos = result.startPos;
258 : else {
259 86 : if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
260 : rWordType == WordType::DICTIONARY_WORD ) &&
261 43 : u_isWhitespace(Text.iterateCodePoints(&result.startPos, 0)) )
262 6 : result.startPos = word.aBreakIterator->following(result.startPos);
263 :
264 43 : result.endPos = word.aBreakIterator->following(result.startPos);
265 43 : if(result.endPos == BreakIterator::DONE)
266 0 : result.endPos = result.startPos;
267 : }
268 48 : return result;
269 : }
270 :
271 :
272 20 : Boundary SAL_CALL BreakIterator_Unicode::previousWord(const OUString& Text, sal_Int32 nStartPos,
273 : const lang::Locale& rLocale, sal_Int16 rWordType) throw(uno::RuntimeException)
274 : {
275 20 : loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
276 :
277 20 : result.startPos = word.aBreakIterator->preceding(nStartPos);
278 20 : if( result.startPos < 0 || result.startPos == BreakIterator::DONE)
279 0 : result.endPos = result.startPos;
280 : else {
281 40 : if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
282 : rWordType == WordType::DICTIONARY_WORD) &&
283 20 : u_isWhitespace(Text.iterateCodePoints(&result.startPos, 0)) )
284 0 : result.startPos = word.aBreakIterator->preceding(result.startPos);
285 :
286 20 : result.endPos = word.aBreakIterator->following(result.startPos);
287 20 : if(result.endPos == BreakIterator::DONE)
288 0 : result.endPos = result.startPos;
289 : }
290 20 : return result;
291 : }
292 :
293 :
294 11823 : Boundary SAL_CALL BreakIterator_Unicode::getWordBoundary( const OUString& Text, sal_Int32 nPos, const lang::Locale& rLocale,
295 : sal_Int16 rWordType, sal_Bool bDirection ) throw(uno::RuntimeException)
296 : {
297 11823 : loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
298 11823 : sal_Int32 len = Text.getLength();
299 :
300 11823 : if(word.aBreakIterator->isBoundary(nPos)) {
301 6079 : result.startPos = result.endPos = nPos;
302 6079 : if((bDirection || nPos == 0) && nPos < len) //forward
303 6079 : result.endPos = word.aBreakIterator->following(nPos);
304 : else
305 0 : result.startPos = word.aBreakIterator->preceding(nPos);
306 : } else {
307 5744 : if(nPos <= 0) {
308 0 : result.startPos = 0;
309 0 : result.endPos = len ? word.aBreakIterator->following((sal_Int32)0) : 0;
310 5744 : } else if(nPos >= len) {
311 0 : result.startPos = word.aBreakIterator->preceding(len);
312 0 : result.endPos = len;
313 : } else {
314 5744 : result.startPos = word.aBreakIterator->preceding(nPos);
315 5744 : result.endPos = word.aBreakIterator->following(nPos);
316 : }
317 : }
318 11823 : if (result.startPos == BreakIterator::DONE)
319 0 : result.startPos = result.endPos;
320 11823 : else if (result.endPos == BreakIterator::DONE)
321 0 : result.endPos = result.startPos;
322 :
323 11823 : return result;
324 : }
325 :
326 :
327 0 : sal_Int32 SAL_CALL BreakIterator_Unicode::beginOfSentence( const OUString& Text, sal_Int32 nStartPos,
328 : const lang::Locale &rLocale ) throw(uno::RuntimeException)
329 : {
330 0 : loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
331 :
332 0 : sal_Int32 len = Text.getLength();
333 0 : if (len > 0 && nStartPos == len)
334 0 : Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
335 0 : if (!sentence.aBreakIterator->isBoundary(nStartPos))
336 0 : nStartPos = sentence.aBreakIterator->preceding(nStartPos);
337 :
338 : // skip preceding space.
339 0 : sal_uInt32 ch = Text.iterateCodePoints(&nStartPos, 1);
340 0 : while (nStartPos < len && u_isWhitespace(ch)) ch = Text.iterateCodePoints(&nStartPos, 1);
341 0 : Text.iterateCodePoints(&nStartPos, -1);
342 :
343 0 : return nStartPos;
344 : }
345 :
346 26 : sal_Int32 SAL_CALL BreakIterator_Unicode::endOfSentence( const OUString& Text, sal_Int32 nStartPos,
347 : const lang::Locale &rLocale ) throw(uno::RuntimeException)
348 : {
349 26 : loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
350 :
351 26 : sal_Int32 len = Text.getLength();
352 26 : if (len > 0 && nStartPos == len)
353 0 : Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
354 26 : nStartPos = sentence.aBreakIterator->following(nStartPos);
355 :
356 26 : sal_Int32 nPos=nStartPos;
357 26 : while (nPos > 0 && u_isWhitespace(Text.iterateCodePoints(&nPos, -1))) nStartPos=nPos;
358 :
359 26 : return nStartPos;
360 : }
361 :
362 8758 : LineBreakResults SAL_CALL BreakIterator_Unicode::getLineBreak(
363 : const OUString& Text, sal_Int32 nStartPos,
364 : const lang::Locale& rLocale, sal_Int32 nMinBreakPos,
365 : const LineBreakHyphenationOptions& hOptions,
366 : const LineBreakUserOptions& /*rOptions*/ ) throw(uno::RuntimeException)
367 : {
368 8758 : LineBreakResults lbr;
369 :
370 8758 : if (nStartPos >= Text.getLength()) {
371 0 : lbr.breakIndex = Text.getLength();
372 0 : lbr.breakType = BreakType::WORDBOUNDARY;
373 0 : return lbr;
374 : }
375 :
376 8758 : loadICUBreakIterator(rLocale, LOAD_LINE_BREAKITERATOR, 0, lineRule, Text);
377 :
378 8758 : sal_Bool GlueSpace=sal_True;
379 26274 : while (GlueSpace) {
380 8758 : if (line.aBreakIterator->preceding(nStartPos + 1) == nStartPos) { //Line boundary break
381 3520 : lbr.breakIndex = nStartPos;
382 3520 : lbr.breakType = BreakType::WORDBOUNDARY;
383 5238 : } else if (hOptions.rHyphenator.is()) { //Hyphenation break
384 : Boundary wBoundary = getWordBoundary( Text, nStartPos, rLocale,
385 2870 : WordType::DICTIONARY_WORD, false);
386 2870 : uno::Reference< linguistic2::XHyphenatedWord > aHyphenatedWord;
387 2870 : aHyphenatedWord = hOptions.rHyphenator->hyphenate(Text.copy(wBoundary.startPos,
388 : wBoundary.endPos - wBoundary.startPos), rLocale,
389 2870 : (sal_Int16) (hOptions.hyphenIndex - wBoundary.startPos), hOptions.aHyphenationOptions);
390 2870 : if (aHyphenatedWord.is()) {
391 0 : lbr.rHyphenatedWord = aHyphenatedWord;
392 0 : if(wBoundary.startPos + aHyphenatedWord->getHyphenationPos() + 1 < nMinBreakPos )
393 0 : lbr.breakIndex = -1;
394 : else
395 0 : lbr.breakIndex = wBoundary.startPos; //aHyphenatedWord->getHyphenationPos();
396 0 : lbr.breakType = BreakType::HYPHENATION;
397 : } else {
398 2870 : lbr.breakIndex = line.aBreakIterator->preceding(nStartPos);
399 2870 : lbr.breakType = BreakType::WORDBOUNDARY;;
400 2870 : }
401 : } else { //word boundary break
402 2368 : lbr.breakIndex = line.aBreakIterator->preceding(nStartPos);
403 2368 : lbr.breakType = BreakType::WORDBOUNDARY;
404 : }
405 :
406 : #define WJ 0x2060 // Word Joiner
407 8758 : GlueSpace=sal_False;
408 8758 : if (lbr.breakType == BreakType::WORDBOUNDARY) {
409 8758 : nStartPos = lbr.breakIndex;
410 8758 : if (Text[nStartPos--] == WJ)
411 0 : GlueSpace=sal_True;
412 23452 : while (nStartPos >= 0 &&
413 4452 : (u_isWhitespace(Text.iterateCodePoints(&nStartPos, 0)) || Text[nStartPos] == WJ)) {
414 1484 : if (Text[nStartPos--] == WJ)
415 0 : GlueSpace=sal_True;
416 : }
417 8758 : if (GlueSpace && nStartPos < 0) {
418 0 : lbr.breakIndex = 0;
419 0 : break;
420 : }
421 : }
422 : }
423 :
424 8758 : return lbr;
425 : }
426 :
427 :
428 :
429 : OUString SAL_CALL
430 0 : BreakIterator_Unicode::getImplementationName(void) throw( uno::RuntimeException )
431 : {
432 0 : return OUString::createFromAscii(cBreakIterator);
433 : }
434 :
435 : sal_Bool SAL_CALL
436 0 : BreakIterator_Unicode::supportsService(const OUString& rServiceName) throw( uno::RuntimeException )
437 : {
438 0 : return !rServiceName.compareToAscii(cBreakIterator);
439 : }
440 :
441 : uno::Sequence< OUString > SAL_CALL
442 0 : BreakIterator_Unicode::getSupportedServiceNames(void) throw( uno::RuntimeException )
443 : {
444 0 : uno::Sequence< OUString > aRet(1);
445 0 : aRet[0] = OUString::createFromAscii(cBreakIterator);
446 0 : return aRet;
447 : }
448 :
449 : } } } }
450 :
451 : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|