Line data Source code
1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /*
3 : * This file is part of the LibreOffice project.
4 : *
5 : * This Source Code Form is subject to the terms of the Mozilla Public
6 : * License, v. 2.0. If a copy of the MPL was not distributed with this
7 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 : *
9 : * This file incorporates work covered by the following license notice:
10 : *
11 : * Licensed to the Apache Software Foundation (ASF) under one or more
12 : * contributor license agreements. See the NOTICE file distributed
13 : * with this work for additional information regarding copyright
14 : * ownership. The ASF licenses this file to you under the Apache
15 : * License, Version 2.0 (the "License"); you may not use this file
16 : * except in compliance with the License. You may obtain a copy of
17 : * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 : */
19 :
20 : #include <breakiterator_unicode.hxx>
21 : #include <localedata.hxx>
22 : #include <unicode/uchar.h>
23 : #include <unicode/locid.h>
24 : #include <unicode/rbbi.h>
25 : #include <unicode/udata.h>
26 : #include <rtl/strbuf.hxx>
27 : #include <rtl/ustring.hxx>
28 : #include <string.h>
29 :
30 : U_CDECL_BEGIN
31 : extern const char OpenOffice_dat[];
32 : U_CDECL_END
33 :
34 : using namespace ::com::sun::star;
35 : using namespace ::com::sun::star::lang;
36 : using namespace ::rtl;
37 :
38 : namespace com { namespace sun { namespace star { namespace i18n {
39 :
40 : #define ERROR ::com::sun::star::uno::RuntimeException()
41 :
42 :
43 93 : BreakIterator_Unicode::BreakIterator_Unicode() :
44 : cBreakIterator( "com.sun.star.i18n.BreakIterator_Unicode" ), // implementation name
45 : wordRule( "word" ),
46 : lineRule( "line" ),
47 93 : icuBI( NULL )
48 : {
49 93 : }
50 :
51 540 : BreakIterator_Unicode::~BreakIterator_Unicode()
52 : {
53 68 : delete character.aBreakIterator;
54 68 : delete sentence.aBreakIterator;
55 68 : delete line.aBreakIterator;
56 340 : for (size_t i = 0; i < SAL_N_ELEMENTS(words); i++)
57 272 : delete words[i].aBreakIterator;
58 472 : }
59 :
60 : /*
61 : Wrapper class to provide public access to the RuleBasedBreakIterator's
62 : setbreakType method.
63 : */
64 418 : class OOoRuleBasedBreakIterator : public RuleBasedBreakIterator {
65 : public:
66 236 : inline void publicSetBreakType(int32_t type) {
67 236 : setBreakType(type);
68 236 : };
69 236 : OOoRuleBasedBreakIterator(UDataMemory* image,
70 : UErrorCode &status) :
71 236 : RuleBasedBreakIterator(image, status) { };
72 :
73 : };
74 :
75 : // loading ICU breakiterator on demand.
76 19122 : void SAL_CALL BreakIterator_Unicode::loadICUBreakIterator(const com::sun::star::lang::Locale& rLocale,
77 : sal_Int16 rBreakType, sal_Int16 nWordType, const sal_Char *rule, const OUString& rText) throw(uno::RuntimeException)
78 : {
79 19122 : sal_Bool newBreak = sal_False;
80 19122 : UErrorCode status = U_ZERO_ERROR;
81 19122 : sal_Int16 breakType = 0;
82 19122 : switch (rBreakType) {
83 9275 : case LOAD_CHARACTER_BREAKITERATOR: icuBI=&character; breakType = 3; break;
84 : case LOAD_WORD_BREAKITERATOR:
85 : assert (nWordType >= 0 && nWordType<= WordType::WORD_COUNT);
86 5488 : icuBI=&words[nWordType];
87 5488 : switch (nWordType) {
88 5 : case WordType::ANY_WORD: break; // odd but previous behavior
89 : case WordType::ANYWORD_IGNOREWHITESPACES:
90 23 : breakType = 0; rule = wordRule = "edit_word"; break;
91 : case WordType::DICTIONARY_WORD:
92 4833 : breakType = 1; rule = wordRule = "dict_word"; break;
93 : default:
94 : case WordType::WORD_COUNT:
95 627 : breakType = 2; rule = wordRule = "count_word"; break;
96 : }
97 5488 : break;
98 5 : case LOAD_SENTENCE_BREAKITERATOR: icuBI=&sentence; breakType = 5; break;
99 4354 : case LOAD_LINE_BREAKITERATOR: icuBI=&line; breakType = 4; break;
100 : }
101 75884 : if (!icuBI->aBreakIterator ||
102 18968 : rLocale.Language != icuBI->maLocale.Language ||
103 18911 : rLocale.Country != icuBI->maLocale.Country ||
104 18883 : rLocale.Variant != icuBI->maLocale.Variant) {
105 239 : if (icuBI->aBreakIterator) {
106 85 : delete icuBI->aBreakIterator;
107 85 : icuBI->aBreakIterator=NULL;
108 : }
109 239 : if (rule) {
110 238 : uno::Sequence< OUString > breakRules = LocaleData().getBreakIteratorRules(rLocale);
111 :
112 238 : status = U_ZERO_ERROR;
113 238 : udata_setAppData("OpenOffice", OpenOffice_dat, &status);
114 238 : if ( !U_SUCCESS(status) ) throw ERROR;
115 :
116 238 : OOoRuleBasedBreakIterator *rbi = NULL;
117 :
118 238 : if (breakRules.getLength() > breakType && !breakRules[breakType].isEmpty())
119 : {
120 : rbi = new OOoRuleBasedBreakIterator(udata_open("OpenOffice", "brk",
121 5 : OUStringToOString(breakRules[breakType], RTL_TEXTENCODING_ASCII_US).getStr(), &status), status);
122 : }
123 233 : else if (rLocale.Language != "th" && rLocale.Language != "km") //use icu's breakiterator for Thai and Khmer
124 : {
125 231 : status = U_ZERO_ERROR;
126 231 : OStringBuffer aUDName(64);
127 231 : aUDName.append(rule);
128 231 : aUDName.append('_');
129 231 : aUDName.append( OUStringToOString(rLocale.Language, RTL_TEXTENCODING_ASCII_US));
130 231 : UDataMemory* pUData = udata_open("OpenOffice", "brk", aUDName.getStr(), &status);
131 231 : if( U_SUCCESS(status) )
132 7 : rbi = new OOoRuleBasedBreakIterator( pUData, status);
133 231 : if (!U_SUCCESS(status) ) {
134 224 : status = U_ZERO_ERROR;
135 224 : pUData = udata_open("OpenOffice", "brk", rule, &status);
136 224 : if( U_SUCCESS(status) )
137 224 : rbi = new OOoRuleBasedBreakIterator( pUData, status);
138 224 : if (!U_SUCCESS(status) ) icuBI->aBreakIterator=NULL;
139 231 : }
140 : }
141 238 : if (rbi) {
142 236 : switch (rBreakType) {
143 12 : case LOAD_CHARACTER_BREAKITERATOR: rbi->publicSetBreakType(UBRK_CHARACTER); break;
144 140 : case LOAD_WORD_BREAKITERATOR: rbi->publicSetBreakType(UBRK_WORD); break;
145 1 : case LOAD_SENTENCE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_SENTENCE); break;
146 83 : case LOAD_LINE_BREAKITERATOR: rbi->publicSetBreakType(UBRK_LINE); break;
147 : }
148 236 : icuBI->aBreakIterator = rbi;
149 238 : }
150 : }
151 :
152 239 : if (!icuBI->aBreakIterator) {
153 : icu::Locale icuLocale(
154 : OUStringToOString(rLocale.Language, RTL_TEXTENCODING_ASCII_US).getStr(),
155 : OUStringToOString(rLocale.Country, RTL_TEXTENCODING_ASCII_US).getStr(),
156 3 : OUStringToOString(rLocale.Variant, RTL_TEXTENCODING_ASCII_US).getStr());
157 :
158 3 : status = U_ZERO_ERROR;
159 3 : switch (rBreakType) {
160 : case LOAD_CHARACTER_BREAKITERATOR:
161 0 : icuBI->aBreakIterator = icu::BreakIterator::createCharacterInstance(icuLocale, status);
162 0 : break;
163 : case LOAD_WORD_BREAKITERATOR:
164 3 : icuBI->aBreakIterator = icu::BreakIterator::createWordInstance(icuLocale, status);
165 3 : break;
166 : case LOAD_SENTENCE_BREAKITERATOR:
167 0 : icuBI->aBreakIterator = icu::BreakIterator::createSentenceInstance(icuLocale, status);
168 0 : break;
169 : case LOAD_LINE_BREAKITERATOR:
170 0 : icuBI->aBreakIterator = icu::BreakIterator::createLineInstance(icuLocale, status);
171 0 : break;
172 : }
173 3 : if ( !U_SUCCESS(status) ) {
174 0 : icuBI->aBreakIterator=NULL;
175 0 : throw ERROR;
176 3 : }
177 : }
178 239 : if (icuBI->aBreakIterator) {
179 239 : icuBI->maLocale=rLocale;
180 239 : newBreak=sal_True;
181 : } else {
182 0 : throw ERROR;
183 : }
184 : }
185 :
186 19122 : if (newBreak || !icuBI->aICUText.equals(rText))
187 : {
188 : // UChar != sal_Unicode in MinGW
189 3463 : const UChar *pText = reinterpret_cast<const UChar *>(rText.getStr());
190 :
191 3463 : icuBI->ut = utext_openUChars(icuBI->ut, pText, rText.getLength(), &status);
192 :
193 3463 : if (!U_SUCCESS(status))
194 0 : throw ERROR;
195 :
196 3463 : icuBI->aBreakIterator->setText(icuBI->ut, status);
197 :
198 3463 : if (!U_SUCCESS(status))
199 0 : throw ERROR;
200 :
201 3463 : icuBI->aICUText = rText;
202 : }
203 19122 : }
204 :
205 9649 : sal_Int32 SAL_CALL BreakIterator_Unicode::nextCharacters( const OUString& Text,
206 : sal_Int32 nStartPos, const lang::Locale &rLocale,
207 : sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
208 : throw(uno::RuntimeException)
209 : {
210 9649 : if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
211 9265 : loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
212 18525 : for (nDone = 0; nDone < nCount; nDone++) {
213 9260 : nStartPos = character.aBreakIterator->following(nStartPos);
214 9260 : if (nStartPos == BreakIterator::DONE)
215 0 : return Text.getLength();
216 : }
217 : } else { // for CHARACTER mode
218 768 : for (nDone = 0; nDone < nCount && nStartPos < Text.getLength(); nDone++)
219 384 : Text.iterateCodePoints(&nStartPos, 1);
220 : }
221 9649 : return nStartPos;
222 : }
223 :
224 31 : sal_Int32 SAL_CALL BreakIterator_Unicode::previousCharacters( const OUString& Text,
225 : sal_Int32 nStartPos, const lang::Locale& rLocale,
226 : sal_Int16 nCharacterIteratorMode, sal_Int32 nCount, sal_Int32& nDone )
227 : throw(uno::RuntimeException)
228 : {
229 31 : if (nCharacterIteratorMode == CharacterIteratorMode::SKIPCELL ) { // for CELL mode
230 10 : loadICUBreakIterator(rLocale, LOAD_CHARACTER_BREAKITERATOR, 0, "char", Text);
231 20 : for (nDone = 0; nDone < nCount; nDone++) {
232 10 : nStartPos = character.aBreakIterator->preceding(nStartPos);
233 10 : if (nStartPos == BreakIterator::DONE)
234 0 : return 0;
235 : }
236 : } else { // for BS to delete one char and CHARACTER mode.
237 42 : for (nDone = 0; nDone < nCount && nStartPos > 0; nDone++)
238 21 : Text.iterateCodePoints(&nStartPos, -1);
239 : }
240 31 : return nStartPos;
241 : }
242 :
243 :
244 15 : Boundary SAL_CALL BreakIterator_Unicode::nextWord( const OUString& Text, sal_Int32 nStartPos,
245 : const lang::Locale& rLocale, sal_Int16 rWordType ) throw(uno::RuntimeException)
246 : {
247 15 : loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
248 :
249 15 : result.startPos = icuBI->aBreakIterator->following(nStartPos);
250 15 : if( result.startPos >= Text.getLength() || result.startPos == BreakIterator::DONE )
251 1 : result.endPos = result.startPos;
252 : else {
253 28 : if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
254 : rWordType == WordType::DICTIONARY_WORD ) &&
255 14 : u_isWhitespace(Text.iterateCodePoints(&result.startPos, 0)) )
256 0 : result.startPos = icuBI->aBreakIterator->following(result.startPos);
257 :
258 14 : result.endPos = icuBI->aBreakIterator->following(result.startPos);
259 14 : if(result.endPos == BreakIterator::DONE)
260 0 : result.endPos = result.startPos;
261 : }
262 15 : return result;
263 : }
264 :
265 :
266 10 : Boundary SAL_CALL BreakIterator_Unicode::previousWord(const OUString& Text, sal_Int32 nStartPos,
267 : const lang::Locale& rLocale, sal_Int16 rWordType) throw(uno::RuntimeException)
268 : {
269 10 : loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
270 :
271 10 : result.startPos = icuBI->aBreakIterator->preceding(nStartPos);
272 10 : if( result.startPos < 0 || result.startPos == BreakIterator::DONE)
273 0 : result.endPos = result.startPos;
274 : else {
275 20 : if ( (rWordType == WordType::ANYWORD_IGNOREWHITESPACES ||
276 : rWordType == WordType::DICTIONARY_WORD) &&
277 10 : u_isWhitespace(Text.iterateCodePoints(&result.startPos, 0)) )
278 0 : result.startPos = icuBI->aBreakIterator->preceding(result.startPos);
279 :
280 10 : result.endPos = icuBI->aBreakIterator->following(result.startPos);
281 10 : if(result.endPos == BreakIterator::DONE)
282 0 : result.endPos = result.startPos;
283 : }
284 10 : return result;
285 : }
286 :
287 :
288 5463 : Boundary SAL_CALL BreakIterator_Unicode::getWordBoundary( const OUString& Text, sal_Int32 nPos, const lang::Locale& rLocale,
289 : sal_Int16 rWordType, sal_Bool bDirection ) throw(uno::RuntimeException)
290 : {
291 5463 : loadICUBreakIterator(rLocale, LOAD_WORD_BREAKITERATOR, rWordType, NULL, Text);
292 5463 : sal_Int32 len = Text.getLength();
293 :
294 5463 : if(icuBI->aBreakIterator->isBoundary(nPos)) {
295 2589 : result.startPos = result.endPos = nPos;
296 2589 : if((bDirection || nPos == 0) && nPos < len) //forward
297 2589 : result.endPos = icuBI->aBreakIterator->following(nPos);
298 : else
299 0 : result.startPos = icuBI->aBreakIterator->preceding(nPos);
300 : } else {
301 2874 : if(nPos <= 0) {
302 0 : result.startPos = 0;
303 0 : result.endPos = len ? icuBI->aBreakIterator->following((sal_Int32)0) : 0;
304 2874 : } else if(nPos >= len) {
305 0 : result.startPos = icuBI->aBreakIterator->preceding(len);
306 0 : result.endPos = len;
307 : } else {
308 2874 : result.startPos = icuBI->aBreakIterator->preceding(nPos);
309 2874 : result.endPos = icuBI->aBreakIterator->following(nPos);
310 : }
311 : }
312 5463 : if (result.startPos == BreakIterator::DONE)
313 0 : result.startPos = result.endPos;
314 5463 : else if (result.endPos == BreakIterator::DONE)
315 0 : result.endPos = result.startPos;
316 :
317 5463 : return result;
318 : }
319 :
320 :
321 0 : sal_Int32 SAL_CALL BreakIterator_Unicode::beginOfSentence( const OUString& Text, sal_Int32 nStartPos,
322 : const lang::Locale &rLocale ) throw(uno::RuntimeException)
323 : {
324 0 : loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
325 :
326 0 : sal_Int32 len = Text.getLength();
327 0 : if (len > 0 && nStartPos == len)
328 0 : Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
329 0 : if (!sentence.aBreakIterator->isBoundary(nStartPos))
330 0 : nStartPos = sentence.aBreakIterator->preceding(nStartPos);
331 :
332 : // skip preceding space.
333 0 : sal_uInt32 ch = Text.iterateCodePoints(&nStartPos, 1);
334 0 : while (nStartPos < len && u_isWhitespace(ch)) ch = Text.iterateCodePoints(&nStartPos, 1);
335 0 : Text.iterateCodePoints(&nStartPos, -1);
336 :
337 0 : return nStartPos;
338 : }
339 :
340 5 : sal_Int32 SAL_CALL BreakIterator_Unicode::endOfSentence( const OUString& Text, sal_Int32 nStartPos,
341 : const lang::Locale &rLocale ) throw(uno::RuntimeException)
342 : {
343 5 : loadICUBreakIterator(rLocale, LOAD_SENTENCE_BREAKITERATOR, 0, "sent", Text);
344 :
345 5 : sal_Int32 len = Text.getLength();
346 5 : if (len > 0 && nStartPos == len)
347 0 : Text.iterateCodePoints(&nStartPos, -1); // issue #i27703# treat end position as part of last sentence
348 5 : nStartPos = sentence.aBreakIterator->following(nStartPos);
349 :
350 5 : sal_Int32 nPos=nStartPos;
351 5 : while (nPos > 0 && u_isWhitespace(Text.iterateCodePoints(&nPos, -1))) nStartPos=nPos;
352 :
353 5 : return nStartPos;
354 : }
355 :
356 4354 : LineBreakResults SAL_CALL BreakIterator_Unicode::getLineBreak(
357 : const OUString& Text, sal_Int32 nStartPos,
358 : const lang::Locale& rLocale, sal_Int32 nMinBreakPos,
359 : const LineBreakHyphenationOptions& hOptions,
360 : const LineBreakUserOptions& /*rOptions*/ ) throw(uno::RuntimeException)
361 : {
362 4354 : LineBreakResults lbr;
363 :
364 4354 : if (nStartPos >= Text.getLength()) {
365 0 : lbr.breakIndex = Text.getLength();
366 0 : lbr.breakType = BreakType::WORDBOUNDARY;
367 0 : return lbr;
368 : }
369 :
370 4354 : loadICUBreakIterator(rLocale, LOAD_LINE_BREAKITERATOR, 0, lineRule, Text);
371 :
372 4354 : sal_Bool GlueSpace=sal_True;
373 13062 : while (GlueSpace) {
374 4354 : if (line.aBreakIterator->preceding(nStartPos + 1) == nStartPos) { //Line boundary break
375 1739 : lbr.breakIndex = nStartPos;
376 1739 : lbr.breakType = BreakType::WORDBOUNDARY;
377 2615 : } else if (hOptions.rHyphenator.is()) { //Hyphenation break
378 : Boundary wBoundary = getWordBoundary( Text, nStartPos, rLocale,
379 1435 : WordType::DICTIONARY_WORD, false);
380 1435 : uno::Reference< linguistic2::XHyphenatedWord > aHyphenatedWord;
381 1435 : aHyphenatedWord = hOptions.rHyphenator->hyphenate(Text.copy(wBoundary.startPos,
382 : wBoundary.endPos - wBoundary.startPos), rLocale,
383 1435 : (sal_Int16) (hOptions.hyphenIndex - wBoundary.startPos), hOptions.aHyphenationOptions);
384 1435 : if (aHyphenatedWord.is()) {
385 0 : lbr.rHyphenatedWord = aHyphenatedWord;
386 0 : if(wBoundary.startPos + aHyphenatedWord->getHyphenationPos() + 1 < nMinBreakPos )
387 0 : lbr.breakIndex = -1;
388 : else
389 0 : lbr.breakIndex = wBoundary.startPos; //aHyphenatedWord->getHyphenationPos();
390 0 : lbr.breakType = BreakType::HYPHENATION;
391 : } else {
392 1435 : lbr.breakIndex = line.aBreakIterator->preceding(nStartPos);
393 1435 : lbr.breakType = BreakType::WORDBOUNDARY;;
394 1435 : }
395 : } else { //word boundary break
396 1180 : lbr.breakIndex = line.aBreakIterator->preceding(nStartPos);
397 1180 : lbr.breakType = BreakType::WORDBOUNDARY;
398 : }
399 :
400 : #define WJ 0x2060 // Word Joiner
401 4354 : GlueSpace=sal_False;
402 4354 : if (lbr.breakType == BreakType::WORDBOUNDARY) {
403 4354 : nStartPos = lbr.breakIndex;
404 4354 : if (Text[nStartPos--] == WJ)
405 0 : GlueSpace=sal_True;
406 11516 : while (nStartPos >= 0 &&
407 2106 : (u_isWhitespace(Text.iterateCodePoints(&nStartPos, 0)) || Text[nStartPos] == WJ)) {
408 702 : if (Text[nStartPos--] == WJ)
409 0 : GlueSpace=sal_True;
410 : }
411 4354 : if (GlueSpace && nStartPos < 0) {
412 0 : lbr.breakIndex = 0;
413 0 : break;
414 : }
415 : }
416 : }
417 :
418 4354 : return lbr;
419 : }
420 :
421 :
422 :
423 : OUString SAL_CALL
424 0 : BreakIterator_Unicode::getImplementationName(void) throw( uno::RuntimeException )
425 : {
426 0 : return OUString::createFromAscii(cBreakIterator);
427 : }
428 :
429 : sal_Bool SAL_CALL
430 0 : BreakIterator_Unicode::supportsService(const OUString& rServiceName) throw( uno::RuntimeException )
431 : {
432 0 : return !rServiceName.compareToAscii(cBreakIterator);
433 : }
434 :
435 : uno::Sequence< OUString > SAL_CALL
436 0 : BreakIterator_Unicode::getSupportedServiceNames(void) throw( uno::RuntimeException )
437 : {
438 0 : uno::Sequence< OUString > aRet(1);
439 0 : aRet[0] = OUString::createFromAscii(cBreakIterator);
440 0 : return aRet;
441 : }
442 :
443 : } } } }
444 :
445 : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|