Line data Source code
1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /*
3 : * This file is part of the LibreOffice project.
4 : *
5 : * This Source Code Form is subject to the terms of the Mozilla Public
6 : * License, v. 2.0. If a copy of the MPL was not distributed with this
7 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 : */
9 :
10 : #include <cppuhelper/compbase1.hxx>
11 : #include <cppuhelper/bootstrap.hxx>
12 : #include <cppuhelper/basemutex.hxx>
13 : #include <com/sun/star/i18n/XBreakIterator.hpp>
14 : #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
15 : #include <com/sun/star/i18n/ScriptType.hpp>
16 : #include <com/sun/star/i18n/WordType.hpp>
17 : #include <unotest/bootstrapfixturebase.hxx>
18 :
19 : #include <unicode/uversion.h>
20 :
21 : #include <rtl/strbuf.hxx>
22 : #include <rtl/ustrbuf.hxx>
23 :
24 : #include <string.h>
25 :
26 : #include <stack>
27 :
28 : using namespace ::com::sun::star;
29 :
30 60 : class TestBreakIterator : public test::BootstrapFixtureBase // CppUnit fixture exercising i18n::XBreakIterator through m_xBreak
31 : {
32 : public:
33 : virtual void setUp() SAL_OVERRIDE;
34 : virtual void tearDown() SAL_OVERRIDE;
35 : // Test cases; some are only registered in the suite below under preprocessor conditions
36 : void testLineBreaking();
37 : void testWordBoundaries();
38 : void testGraphemeIteration();
39 : void testWeak();
40 : void testAsian();
41 : void testThai();
42 : void testLao();
43 : #ifdef TODO // testNorthernThai is only declared (and registered below) when TODO is defined
44 : void testNorthernThai();
45 : #endif
46 : void testKhmer();
47 : void testJapanese();
48 : void testChinese();
49 4 : CPPUNIT_TEST_SUITE(TestBreakIterator);
50 2 : CPPUNIT_TEST(testLineBreaking);
51 2 : CPPUNIT_TEST(testGraphemeIteration);
52 2 : CPPUNIT_TEST(testWeak);
53 2 : CPPUNIT_TEST(testAsian);
54 2 : CPPUNIT_TEST(testThai);
55 : #ifdef TODO
56 : CPPUNIT_TEST(testNorthernThai);
57 : #endif
58 :
59 2 : CPPUNIT_TEST(testWordBoundaries);
60 : #if (U_ICU_VERSION_MAJOR_NUM > 4) // testKhmer is declared unconditionally but only registered for ICU major versions above 4
61 2 : CPPUNIT_TEST(testKhmer);
62 : #endif
63 : #if (U_ICU_VERSION_MAJOR_NUM > 51) // testLao is declared unconditionally but only registered for ICU major versions above 51
64 2 : CPPUNIT_TEST(testLao);
65 : #endif
66 2 : CPPUNIT_TEST(testJapanese);
67 2 : CPPUNIT_TEST(testChinese);
68 4 : CPPUNIT_TEST_SUITE_END();
69 : private:
70 : uno::Reference<i18n::XBreakIterator> m_xBreak; // break iterator used by the test cases; NOTE(review): presumably created in setUp() - its definition is not visible in this section
71 : void doTestJapanese(uno::Reference< i18n::XBreakIterator > &xBreak); // helper taking the iterator to test by reference; definition not visible in this section
72 : };
73 :
74 2 : void TestBreakIterator::testLineBreaking() // Checks the breakIndex returned by getLineBreak() for several regression cases
75 : {
76 2 : i18n::LineBreakHyphenationOptions aHyphOptions;
77 4 : i18n::LineBreakUserOptions aUserOptions;
78 4 : lang::Locale aLocale;
79 : // The option structs are passed to every getLineBreak() call below without being modified here
80 : //See https://bugs.libreoffice.org/show_bug.cgi?id=31271
81 : {
82 2 : OUString aTest("(some text here)");
83 :
84 2 : aLocale.Language = "en";
85 2 : aLocale.Country = "US";
86 :
87 : {
88 : //Here we want the line break to leave text here) on the next line (requested position 9 is inside "text", which starts at index 6)
89 2 : i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some tex"), aLocale, 0, aHyphOptions, aUserOptions);
90 2 : CPPUNIT_ASSERT_MESSAGE("Expected a break at the start of the word", aResult.breakIndex == 6);
91 : }
92 :
93 : {
94 : //Here we want the line break to leave "here)" on the next line (requested position 15 is just before ")", "here" starts at index 11)
95 2 : i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some text here"), aLocale, 0, aHyphOptions, aUserOptions);
96 2 : CPPUNIT_ASSERT_MESSAGE("Expected a break at the start of the word", aResult.breakIndex == 11);
97 2 : }
98 : }
99 :
100 : //See https://bugs.libreoffice.org/show_bug.cgi?id=49849
101 : {
102 2 : const sal_Unicode HEBREW1[] = { 0x05DE, 0x05D9, 0x05DC, 0x05D9, 0x5DD }; // 0x5DD is the same value as 0x05DD
103 2 : OUString aWord(HEBREW1, SAL_N_ELEMENTS(HEBREW1));
104 4 : OUString aTest(OUStringBuffer(aWord).append(' ').append(aWord).makeStringAndClear());
105 : // aTest is the word, one space, then the same word again
106 2 : aLocale.Language = "he";
107 2 : aLocale.Country = "IL";
108 :
109 : {
110 : //Here we want the line break to happen at the whitespace (expected index is the start of the second word, just past the space)
111 2 : i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-1, aLocale, 0, aHyphOptions, aUserOptions);
112 2 : CPPUNIT_ASSERT_MESSAGE("Expected a break at the start of the word", aResult.breakIndex == aWord.getLength()+1);
113 2 : }
114 : }
115 :
116 : //See https://issues.apache.org/ooo/show_bug.cgi?id=17155
117 : {
118 2 : OUString aTest("foo /bar/baz");
119 :
120 2 : aLocale.Language = "en";
121 2 : aLocale.Country = "US";
122 :
123 : {
124 : //Here we want the line break to leave /bar/ba clumped together on the next line (index 4 is the first '/')
125 2 : i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("foo /bar/ba"), aLocale, 0,
126 2 : aHyphOptions, aUserOptions);
127 2 : CPPUNIT_ASSERT_MESSAGE("Expected a break at the first slash", aResult.breakIndex == 4);
128 2 : }
129 : }
130 :
131 : //See https://issues.apache.org/ooo/show_bug.cgi?id=19716
132 : {
133 2 : OUString aTest("aaa]aaa");
134 :
135 2 : aLocale.Language = "en";
136 2 : aLocale.Country = "US";
137 :
138 : {
139 : //Here we want the line break to move the whole lot to the next line (requested position is length-2, i.e. index 5, after the ']')
140 4 : i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-2, aLocale, 0,
141 4 : aHyphOptions, aUserOptions);
142 2 : CPPUNIT_ASSERT_MESSAGE("Expected a break at the start of the line, not at ]", aResult.breakIndex == 0);
143 2 : }
144 2 : }
145 2 : }
146 :
147 : //See https://bugs.libreoffice.org/show_bug.cgi?id=49629
148 2 : void TestBreakIterator::testWordBoundaries() // Checks isBeginWord/isEndWord/getWordBoundary/nextWord/previousWord results for a series of regression cases
149 : {
150 2 : lang::Locale aLocale;
151 2 : aLocale.Language = "en";
152 2 : aLocale.Country = "US";
153 :
154 2 : i18n::Boundary aBounds;
155 : // aBounds is reused for the result of most lookups below; aLocale is changed by some of the later cases
156 : //See https://issues.apache.org/ooo/show_bug.cgi?id=11993
157 : {
158 2 : OUString aTest("abcd ef ghi??? KLM");
159 : // NOTE(review): the offsets asserted below (a word at 9..12, "KLM" at 16..19) only fit if "ef" is followed by two spaces; the literal shown here has one - confirm against the real file
160 2 : CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD));
161 2 : CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD));
162 2 : aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD, true);
163 2 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
164 : // position 8 is expected to be neither the start nor the end of a word
165 2 : CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD));
166 2 : CPPUNIT_ASSERT(!m_xBreak->isEndWord(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD));
167 :
168 : //next word
169 2 : aBounds = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, true);
170 2 : CPPUNIT_ASSERT(aBounds.startPos == 9 && aBounds.endPos == 12);
171 :
172 : //previous word
173 2 : aBounds = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, false);
174 2 : CPPUNIT_ASSERT(aBounds.startPos == 5 && aBounds.endPos == 7);
175 :
176 2 : CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD));
177 2 : CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD));
178 2 : aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD, true);
179 2 : CPPUNIT_ASSERT(aBounds.startPos == 9 && aBounds.endPos == 12);
180 :
181 2 : CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD));
182 2 : CPPUNIT_ASSERT(!m_xBreak->isEndWord(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD));
183 2 : aBounds = m_xBreak->getWordBoundary(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD, true);
184 2 : CPPUNIT_ASSERT(aBounds.startPos == 16 && aBounds.endPos == 19);
185 : }
186 :
187 : //See https://issues.apache.org/ooo/show_bug.cgi?id=21907
188 : {
189 2 : OUString aTest("b a?");
190 : // positions 1, 2 and 3 are each expected to be both a word start and a word end for ANY_WORD
191 2 : CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 1, aLocale, i18n::WordType::ANY_WORD));
192 2 : CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 2, aLocale, i18n::WordType::ANY_WORD));
193 2 : CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 3, aLocale, i18n::WordType::ANY_WORD));
194 :
195 2 : CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES));
196 :
197 2 : CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 1, aLocale, i18n::WordType::ANY_WORD));
198 2 : CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 2, aLocale, i18n::WordType::ANY_WORD));
199 2 : CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 3, aLocale, i18n::WordType::ANY_WORD));
200 :
201 2 : CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES));
202 : }
203 :
204 : //See https://issues.apache.org/ooo/show_bug.cgi?id=14904
205 : {
206 : const sal_Unicode TEST[] =
207 : {
208 : 'W', 'o', 'r', 'k', 'i', 'n', 'g', ' ', 0x201C, 'W', 'o', 'r', 'd', 's',
209 : ' ', 's', 't', 'a', 'r', 't', 'i', 'n', 'g', ' ', 'w', 'i', 't',
210 : 'h', ' ', 'q', 'u', 'o', 't', 'e', 's', 0x201D, ' ', 'W', 'o', 'r', 'k',
211 : 'i', 'n', 'g', ' ', 0x2018, 'B', 'r', 'o', 'k', 'e', 'n', 0x2019, ' ',
212 : '?', 'S', 'p', 'a', 'n', 'i', 's', 'h', '?', ' ', 'd', 'o', 'e',
213 : 's', 'n', 0x2019, 't', ' ', 'w', 'o', 'r', 'k', '.', ' ', 'N', 'o',
214 : 't', ' ', 'e', 'v', 'e', 'n', ' ' , 0x00BF, 'r', 'e', 'a', 'l', '?', ' ',
215 : 'S', 'p', 'a', 'n', 'i', 's', 'h'
216 2 : };
217 2 : OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
218 : // each lookup starts inside a word; the expected bounds exclude the adjacent quote/punctuation characters
219 2 : aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD, false);
220 2 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 7);
221 :
222 2 : aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD, false);
223 2 : CPPUNIT_ASSERT(aBounds.startPos == 9 && aBounds.endPos == 14);
224 :
225 2 : aBounds = m_xBreak->getWordBoundary(aTest, 40, aLocale, i18n::WordType::DICTIONARY_WORD, false);
226 2 : CPPUNIT_ASSERT(aBounds.startPos == 37 && aBounds.endPos == 44);
227 :
228 2 : aBounds = m_xBreak->getWordBoundary(aTest, 49, aLocale, i18n::WordType::DICTIONARY_WORD, false);
229 2 : CPPUNIT_ASSERT(aBounds.startPos == 46 && aBounds.endPos == 52);
230 :
231 2 : aBounds = m_xBreak->getWordBoundary(aTest, 58, aLocale, i18n::WordType::DICTIONARY_WORD, false);
232 2 : CPPUNIT_ASSERT(aBounds.startPos == 55 && aBounds.endPos == 62);
233 :
234 2 : aBounds = m_xBreak->getWordBoundary(aTest, 67, aLocale, i18n::WordType::DICTIONARY_WORD, false);
235 2 : CPPUNIT_ASSERT(aBounds.startPos == 64 && aBounds.endPos == 71);
236 :
237 2 : aBounds = m_xBreak->getWordBoundary(aTest, 90, aLocale, i18n::WordType::DICTIONARY_WORD, false);
238 2 : CPPUNIT_ASSERT(aBounds.startPos == 88 && aBounds.endPos == 92);
239 : }
240 :
241 : //See https://bugs.libreoffice.org/show_bug.cgi?id=49629
242 2 : sal_Unicode aBreakTests[] = { ' ', 1, 2, 3, 4, 5, 6, 7, 0x91, 0x92, 0x200B, 0xE8FF, 0xF8FF }; // each of these, placed between two "Word"s, is expected to end the first word at index 4
243 10 : for (int mode = i18n::WordType::ANY_WORD; mode <= i18n::WordType::WORD_COUNT; ++mode)
244 : {
245 : //make sure that in all cases isBeginWord and isEndWord matches getWordBoundary
246 112 : for (size_t i = 0; i < SAL_N_ELEMENTS(aBreakTests); ++i)
247 : {
248 : #if (U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM <= 2)
249 : //Note the breakiterator test is known to fail on older icu
250 : //versions (4.2.1) for the 200B (ZWSP) Zero Width Space testcase.
251 : if (aBreakTests[i] == 0x200B)
252 : continue;
253 : #endif
254 104 : OUString aTest = "Word" + OUString(aBreakTests[i]) + "Word";
255 104 : aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, mode, true);
256 104 : switch (mode) // every word type currently expects the same bounds; the cases are kept separate per mode
257 : {
258 : case i18n::WordType::ANY_WORD:
259 26 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
260 26 : break;
261 : case i18n::WordType::ANYWORD_IGNOREWHITESPACES:
262 26 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
263 26 : break;
264 : case i18n::WordType::DICTIONARY_WORD:
265 26 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
266 26 : break;
267 : case i18n::WordType::WORD_COUNT:
268 26 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
269 26 : break;
270 : }
271 :
272 104 : CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, aBounds.startPos, aLocale, mode));
273 104 : CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, aBounds.endPos, aLocale, mode));
274 104 : }
275 : }
276 :
277 2 : sal_Unicode aJoinTests[] = { 'X', 0x200C, 0x200D, 0x2060, 0xFEFF, 0xFFF9, 0xFFFA, 0xFFFB }; // each of these, placed between two "Word"s, is expected NOT to split the word: the boundary spans all 9 units
278 10 : for (int mode = i18n::WordType::ANY_WORD; mode <= i18n::WordType::WORD_COUNT; ++mode)
279 : {
280 : //make sure that in all cases isBeginWord and isEndWord matches getWordBoundary
281 72 : for (size_t i = 0; i < SAL_N_ELEMENTS(aJoinTests); ++i)
282 : {
283 64 : OUString aTest = "Word" + OUString(aJoinTests[i]) + "Word";
284 64 : aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, mode, true);
285 64 : switch (mode) // every word type currently expects the same bounds; the cases are kept separate per mode
286 : {
287 : case i18n::WordType::ANY_WORD:
288 16 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
289 16 : break;
290 : case i18n::WordType::ANYWORD_IGNOREWHITESPACES:
291 16 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
292 16 : break;
293 : case i18n::WordType::DICTIONARY_WORD:
294 16 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
295 16 : break;
296 : case i18n::WordType::WORD_COUNT:
297 16 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
298 16 : break;
299 : }
300 :
301 64 : CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, aBounds.startPos, aLocale, mode));
302 64 : CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, aBounds.endPos, aLocale, mode));
303 64 : }
304 : }
305 :
306 : //See https://issues.apache.org/ooo/show_bug.cgi?id=13494
307 : {
308 2 : const OUString aBase("xxAAxxBBxxCCxx");
309 : const sal_Unicode aTests[] =
310 : {
311 : '\'', ';', ',', '.', '!', '@', '#', '%', '&', '*',
312 : '(', ')', '_', '-', '{', '}', '[', ']', '\"', '/',
313 : '\\', '?', '~', '$', '+', '^', '=', '<', '>', '|'
314 2 : };
315 : // First pass: every 'x' in aBase becomes the punctuation char, so each "xx" is a doubled char; expected word starts follow
316 2 : const sal_Int32 aDoublePositions[] = {0, 2, 4, 6, 8, 10, 12, 14};
317 62 : for (size_t j = 0; j < SAL_N_ELEMENTS(aTests); ++j)
318 : {
319 60 : OUString aTest = aBase.replace('x', aTests[j]);
320 60 : sal_Int32 nPos = -1;
321 60 : size_t i = 0;
322 480 : do
323 : {
324 480 : CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aDoublePositions));
325 480 : nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
326 480 : CPPUNIT_ASSERT(nPos == aDoublePositions[i++]);
327 : }
328 480 : while (nPos < aTest.getLength());
329 60 : nPos = aTest.getLength(); // now walk backwards from the end and expect the same positions in reverse
330 60 : i = SAL_N_ELEMENTS(aDoublePositions)-1;
331 420 : do
332 : {
333 420 : nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
334 420 : CPPUNIT_ASSERT(nPos == aDoublePositions[--i]);
335 : }
336 : while (nPos > 0);
337 60 : }
338 : // Second pass: each "xx" in aBase becomes a single punctuation char
339 2 : const sal_Int32 aSinglePositions[] = {0, 1, 3, 4, 6, 7, 9, 10};
340 60 : for (size_t j = 1; j < SAL_N_ELEMENTS(aTests); ++j) // starts at 1: aTests[0] (the single quote) has its own expectations below
341 : {
342 58 : OUString aTest = aBase.replaceAll(OUString("xx"), OUString(aTests[j]));
343 58 : sal_Int32 nPos = -1;
344 58 : size_t i = 0;
345 464 : do
346 : {
347 464 : CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aSinglePositions));
348 464 : nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
349 464 : CPPUNIT_ASSERT(nPos == aSinglePositions[i++]);
350 : }
351 464 : while (nPos < aTest.getLength());
352 58 : nPos = aTest.getLength(); // now walk backwards from the end and expect the same positions in reverse
353 58 : i = SAL_N_ELEMENTS(aSinglePositions)-1;
354 406 : do
355 : {
356 406 : nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
357 406 : CPPUNIT_ASSERT(nPos == aSinglePositions[--i]);
358 : }
359 : while (nPos > 0);
360 58 : }
361 : // Third pass: a single quote between the letter pairs yields fewer word starts (only 0, 1, 9 and 10 are expected)
362 2 : const sal_Int32 aSingleQuotePositions[] = {0, 1, 9, 10};
363 2 : CPPUNIT_ASSERT(aTests[0] == '\'');
364 : {
365 2 : OUString aTest = aBase.replaceAll(OUString("xx"), OUString(aTests[0]));
366 2 : sal_Int32 nPos = -1;
367 2 : size_t i = 0;
368 8 : do
369 : {
370 8 : CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aSingleQuotePositions));
371 8 : nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
372 8 : CPPUNIT_ASSERT(nPos == aSingleQuotePositions[i++]);
373 : }
374 8 : while (nPos < aTest.getLength());
375 2 : nPos = aTest.getLength();
376 2 : i = SAL_N_ELEMENTS(aSingleQuotePositions)-1;
377 6 : do
378 : {
379 6 : nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
380 6 : CPPUNIT_ASSERT(nPos == aSingleQuotePositions[--i]);
381 : }
382 2 : while (nPos > 0);
383 2 : }
384 : }
385 :
386 : //See https://issues.apache.org/ooo/show_bug.cgi?id=13451
387 : {
388 2 : aLocale.Language = "ca";
389 2 : aLocale.Country = "ES";
390 :
391 2 : OUString aTest("mirar-se comprar-vos donem-nos les mans aneu-vos-en!");
392 : // aExpected lists the successive endPos values; the hyphenated forms are expected to stay whole words
393 2 : sal_Int32 nPos = 0;
394 2 : sal_Int32 aExpected[] = {8, 20, 30, 34, 39, 51, 52};
395 2 : size_t i = 0;
396 14 : do
397 : {
398 14 : CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
399 14 : nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
400 14 : i18n::WordType::DICTIONARY_WORD, true).endPos;
401 14 : CPPUNIT_ASSERT(aExpected[i++] == nPos);
402 : }
403 14 : while (nPos++ < aTest.getLength()); // the post-increment steps one past the boundary before the next lookup
404 2 : CPPUNIT_ASSERT(i == SAL_N_ELEMENTS(aExpected));
405 : }
406 :
407 : //See https://issues.apache.org/ooo/show_bug.cgi?id=85411
408 8 : for (int j = 0; j < 3; ++j) // the same text is checked under en-US, ca-ES and fi-FI
409 : {
410 6 : switch (j)
411 : {
412 : case 0:
413 2 : aLocale.Language = "en";
414 2 : aLocale.Country = "US";
415 2 : break;
416 : case 1:
417 2 : aLocale.Language = "ca";
418 2 : aLocale.Country = "ES";
419 2 : break;
420 : case 2:
421 2 : aLocale.Language = "fi";
422 2 : aLocale.Country = "FI";
423 2 : break;
424 : default:
425 0 : CPPUNIT_ASSERT(false);
426 0 : break;
427 : }
428 : // words separated only by 0x200B; each is expected to end where the 0x200B (or the end of the text) is
429 : const sal_Unicode TEST[] =
430 : {
431 : 'I', 0x200B, 'w', 'a', 'n', 't', 0x200B, 't', 'o', 0x200B, 'g', 'o'
432 6 : };
433 6 : OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
434 :
435 6 : sal_Int32 nPos = 0;
436 6 : sal_Int32 aExpected[] = {1, 6, 9, 12};
437 6 : size_t i = 0;
438 24 : do
439 : {
440 24 : CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
441 24 : nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
442 24 : i18n::WordType::DICTIONARY_WORD, true).endPos;
443 24 : CPPUNIT_ASSERT(aExpected[i++] == nPos);
444 : }
445 24 : while (nPos++ < aTest.getLength());
446 6 : CPPUNIT_ASSERT(i == SAL_N_ELEMENTS(aExpected));
447 6 : }
448 :
449 : //https://issues.apache.org/ooo/show_bug.cgi?id=21290
450 6 : for (int j = 0; j < 2; ++j) // the same text is checked under en-US and under "grc" with an empty country
451 : {
452 4 : switch (j)
453 : {
454 : case 0:
455 2 : aLocale.Language = "en";
456 2 : aLocale.Country = "US";
457 2 : break;
458 : case 1:
459 2 : aLocale.Language = "grc";
460 2 : aLocale.Country = "";
461 2 : break;
462 : default:
463 0 : CPPUNIT_ASSERT(false);
464 0 : break;
465 : }
466 : // four space-separated (0x0020) words; aExpected below lists where each one ends
467 : const sal_Unicode TEST[] =
468 : {
469 : 0x1F0C, 0x03BD, 0x03B4, 0x03C1, 0x03B1, 0x0020, 0x1F00,
470 : 0x03C1, 0x03BD, 0x1F7B, 0x03BC, 0x03B5, 0x03BD, 0x03BF,
471 : 0x03C2, 0x0020, 0x1F00, 0x03BB, 0x03BB, 0x0020, 0x1F24,
472 : 0x03C3, 0x03B8, 0x03B9, 0x03BF, 0x03BD
473 4 : };
474 4 : OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
475 :
476 4 : sal_Int32 nPos = 0;
477 4 : sal_Int32 aExpected[] = {5, 15, 19, 26};
478 4 : size_t i = 0;
479 16 : do
480 : {
481 16 : CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
482 16 : nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
483 16 : i18n::WordType::DICTIONARY_WORD, true).endPos;
484 16 : CPPUNIT_ASSERT(aExpected[i++] == nPos);
485 : }
486 16 : while (nPos++ < aTest.getLength());
487 4 : CPPUNIT_ASSERT(i == SAL_N_ELEMENTS(aExpected));
488 4 : }
489 :
490 : //See https://issues.apache.org/ooo/show_bug.cgi?id=58513
491 : //See https://bugs.freedesktop.org/show_bug.cgi?id=55707
492 : {
493 2 : aLocale.Language = "fi";
494 2 : aLocale.Country = "FI";
495 :
496 2 : OUString aTest("Kuorma-auto kaakkois- ja Keski-Suomi USA:n 90:n %:n");
497 : // WORD_COUNT: only the endPos of each boundary is checked; each space-separated token is expected to be one word
498 : {
499 2 : sal_Int32 nPos = 0;
500 2 : sal_Int32 aExpected[] = {11, 21, 24, 36, 42, 47, 51};
501 2 : size_t i = 0;
502 14 : do
503 : {
504 14 : CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
505 14 : nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
506 14 : i18n::WordType::WORD_COUNT, true).endPos;
507 14 : CPPUNIT_ASSERT(aExpected[i++] == nPos);
508 : }
509 14 : while (nPos++ < aTest.getLength());
510 2 : CPPUNIT_ASSERT(i == SAL_N_ELEMENTS(aExpected));
511 : }
512 : // DICTIONARY_WORD: aExpected holds (startPos, endPos) pairs, so i advances twice per iteration
513 : {
514 2 : sal_Int32 nPos = 0;
515 : sal_Int32 aExpected[] = {0, 11, 12, 20, 22, 24, 25, 36, 37,
516 2 : 40, 41, 42, 43, 45, 46, 47, 50, 51};
517 2 : size_t i = 0;
518 18 : do
519 : {
520 18 : CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
521 18 : aBounds = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
522 18 : i18n::WordType::DICTIONARY_WORD, true);
523 18 : CPPUNIT_ASSERT(aExpected[i++] == aBounds.startPos);
524 18 : CPPUNIT_ASSERT(aExpected[i++] == aBounds.endPos);
525 18 : nPos = aBounds.endPos;
526 : }
527 18 : while (nPos++ < aTest.getLength());
528 2 : CPPUNIT_ASSERT(i == SAL_N_ELEMENTS(aExpected));
529 2 : }
530 : }
531 :
532 : //See https://issues.apache.org/ooo/show_bug.cgi?id=107843
533 : {
534 2 : aLocale.Language = "en";
535 2 : aLocale.Country = "US";
536 :
537 : const sal_Unicode TEST[] =
538 : {
539 : 'r', 'u', 0xFB00, 'l', 'e', ' ', 0xFB01, 's', 'h'
540 2 : };
541 2 : OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
542 : // 0xFB00 and 0xFB01 are expected to be treated as part of the surrounding word
543 2 : aBounds = m_xBreak->getWordBoundary(aTest, 1, aLocale, i18n::WordType::DICTIONARY_WORD, false);
544 2 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 5);
545 :
546 2 : aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, i18n::WordType::DICTIONARY_WORD, false);
547 2 : CPPUNIT_ASSERT(aBounds.startPos == 6 && aBounds.endPos == 9);
548 : }
549 :
550 : //See https://issues.apache.org/ooo/show_bug.cgi?id=113785
551 : {
552 2 : aLocale.Language = "en";
553 2 : aLocale.Country = "US";
554 :
555 : const sal_Unicode TEST[] =
556 : {
557 : 'a', 0x2013, 'b', 0x2014, 'c'
558 2 : };
559 2 : OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
560 : // 0x2013 and 0x2014 are expected to separate the three single-letter words
561 2 : aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, i18n::WordType::DICTIONARY_WORD, true);
562 2 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 1);
563 :
564 2 : aBounds = m_xBreak->nextWord(aTest, 0, aLocale, i18n::WordType::DICTIONARY_WORD);
565 2 : CPPUNIT_ASSERT(aBounds.startPos == 2 && aBounds.endPos == 3);
566 :
567 2 : aBounds = m_xBreak->nextWord(aTest, aBounds.endPos, aLocale, i18n::WordType::DICTIONARY_WORD);
568 2 : CPPUNIT_ASSERT(aBounds.startPos == 4 && aBounds.endPos == 5);
569 2 : }
570 2 : }
571 :
572 : //See https://bugs.libreoffice.org/show_bug.cgi?id=40292
573 : //See https://issues.apache.org/ooo/show_bug.cgi?id=80412
574 : //See https://issues.apache.org/ooo/show_bug.cgi?id=111152
575 : //See https://issues.apache.org/ooo/show_bug.cgi?id=50172
576 2 : void TestBreakIterator::testGraphemeIteration() // Checks that SKIPCELL nextCharacters/previousCharacters step over whole clusters rather than single code units
577 : {
578 2 : lang::Locale aLocale;
579 2 : aLocale.Language = "bn";
580 2 : aLocale.Country = "IN";
581 : // bn-IN cases: one forward step from 0 must reach the end of the text, one backward step from the end must reach 0
582 : {
583 2 : const sal_Unicode BA_HALANT_LA[] = { 0x09AC, 0x09CD, 0x09AF };
584 2 : OUString aTest(BA_HALANT_LA, SAL_N_ELEMENTS(BA_HALANT_LA));
585 :
586 2 : sal_Int32 nDone=0;
587 : sal_Int32 nPos;
588 2 : nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
589 2 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
590 2 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == SAL_N_ELEMENTS(BA_HALANT_LA));
591 2 : nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(BA_HALANT_LA), aLocale,
592 2 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
593 2 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
594 : }
595 :
596 : {
597 2 : const sal_Unicode HA_HALANT_NA_VOWELSIGNI[] = { 0x09B9, 0x09CD, 0x09A3, 0x09BF };
598 2 : OUString aTest(HA_HALANT_NA_VOWELSIGNI, SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI));
599 :
600 2 : sal_Int32 nDone=0;
601 : sal_Int32 nPos;
602 2 : nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
603 2 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
604 2 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI));
605 2 : nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI), aLocale,
606 2 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
607 2 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
608 : }
609 :
610 : {
611 2 : const sal_Unicode TA_HALANT_MA_HALANT_YA [] = { 0x09A4, 0x09CD, 0x09AE, 0x09CD, 0x09AF };
612 2 : OUString aTest(TA_HALANT_MA_HALANT_YA, SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA));
613 :
614 2 : sal_Int32 nDone=0;
615 : sal_Int32 nPos;
616 2 : nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
617 2 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
618 2 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA));
619 2 : nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA), aLocale,
620 2 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
621 2 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
622 : }
623 : // ta-IN cases
624 2 : aLocale.Language = "ta";
625 2 : aLocale.Country = "IN";
626 :
627 : {
628 2 : const sal_Unicode KA_VIRAMA_SSA[] = { 0x0B95, 0x0BCD, 0x0BB7 };
629 2 : OUString aTest(KA_VIRAMA_SSA, SAL_N_ELEMENTS(KA_VIRAMA_SSA));
630 :
631 2 : sal_Int32 nDone=0;
632 2 : sal_Int32 nPos = 0;
633 :
634 2 : nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
635 2 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
636 2 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == SAL_N_ELEMENTS(KA_VIRAMA_SSA));
637 2 : nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(KA_VIRAMA_SSA), aLocale,
638 2 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
639 2 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
640 : }
641 :
642 : {
643 2 : const sal_Unicode KA_VOWELSIGNU[] = { 0x0B95, 0x0BC1 };
644 2 : OUString aTest(KA_VOWELSIGNU, SAL_N_ELEMENTS(KA_VOWELSIGNU));
645 :
646 2 : sal_Int32 nDone=0;
647 2 : sal_Int32 nPos = 0;
648 :
649 2 : nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
650 2 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
651 2 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == SAL_N_ELEMENTS(KA_VOWELSIGNU));
652 2 : nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(KA_VOWELSIGNU), aLocale,
653 2 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
654 2 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
655 : }
656 :
657 : {
658 : const sal_Unicode CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI[] =
659 2 : { 0x0B9A, 0x0BBF, 0x0BA4, 0x0BCD, 0x0BA4, 0x0BBF, 0x0BB0, 0x0BC8 };
660 : OUString aTest(CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI,
661 2 : SAL_N_ELEMENTS(CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI));
662 :
663 2 : sal_Int32 nDone=0;
664 2 : sal_Int32 nPos=0;
665 : // forwards: four steps, each expected to advance by exactly two code units
666 10 : for (sal_Int32 i = 0; i < 4; ++i)
667 : {
668 8 : sal_Int32 nOldPos = nPos;
669 8 : nPos = m_xBreak->nextCharacters(aTest, nPos, aLocale,
670 8 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
671 8 : CPPUNIT_ASSERT_MESSAGE("Should skip 2 units", nPos == nOldPos+2);
672 : }
673 : // and back again from where the forward pass stopped, two code units per step
674 10 : for (sal_Int32 i = 0; i < 4; ++i)
675 : {
676 8 : sal_Int32 nOldPos = nPos;
677 8 : nPos = m_xBreak->previousCharacters(aTest, nPos, aLocale,
678 8 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
679 8 : CPPUNIT_ASSERT_MESSAGE("Should skip 2 units", nPos == nOldPos-2);
680 2 : }
681 : }
682 :
683 : {
684 2 : const sal_Unicode ALEF_QAMATS [] = { 0x05D0, 0x05B8 };
685 2 : OUString aText(ALEF_QAMATS, SAL_N_ELEMENTS(ALEF_QAMATS));
686 :
687 2 : sal_Int32 nGraphemeCount = 0;
688 : // counts how many SKIPCELL steps it takes to cross the text; this case uses a default-constructed locale instead of aLocale
689 2 : sal_Int32 nCurPos = 0;
690 6 : while (nCurPos < aText.getLength())
691 : {
692 2 : sal_Int32 nCount2 = 1;
693 2 : nCurPos = m_xBreak->nextCharacters(aText, nCurPos, lang::Locale(),
694 2 : i18n::CharacterIteratorMode::SKIPCELL, nCount2, nCount2); // NOTE(review): nCount2 is passed as both of the last two arguments, unlike the other calls here (1, nDone) - confirm intended
695 2 : ++nGraphemeCount;
696 : }
697 :
698 2 : CPPUNIT_ASSERT_MESSAGE("Should be considered 1 grapheme", nGraphemeCount == 1);
699 : }
700 : // hi-IN case
701 2 : aLocale.Language = "hi";
702 2 : aLocale.Country = "IN";
703 :
704 : {
705 2 : const sal_Unicode SHA_VOWELSIGNII[] = { 0x936, 0x940 };
706 2 : OUString aTest(SHA_VOWELSIGNII, SAL_N_ELEMENTS(SHA_VOWELSIGNII));
707 :
708 2 : sal_Int32 nDone=0;
709 2 : sal_Int32 nPos = 0;
710 :
711 2 : nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
712 2 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
713 2 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == SAL_N_ELEMENTS(SHA_VOWELSIGNII));
714 2 : nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(SHA_VOWELSIGNII), aLocale,
715 2 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
716 2 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
717 2 : }
718 2 : }
719 :
720 : //A test to ensure that certain ranges and codepoints that are categorized as
721 : //weak remain as weak, so that existing docs that depend on this don't silently
722 : //change font for those weak chars
723 2 : void TestBreakIterator::testWeak() // Asserts that getScriptType() reports ScriptType::WEAK for every code unit listed in WEAKS
724 : {
725 2 : lang::Locale aLocale; // set up below but not passed to getScriptType() in this function
726 2 : aLocale.Language = "en";
727 2 : aLocale.Country = "US";
728 :
729 : {
730 : const sal_Unicode WEAKS[] =
731 : {
732 : 0x0001, 0x0002,
733 : 0x0020, 0x00A0,
734 : 0x2150, 0x215F, //Number Forms, fractions
735 : 0x2160, 0x2180, //Number Forms, roman numerals
736 : 0x2200, 0x22FF, //Mathematical Operators
737 : 0x27C0, 0x27EF, //Miscellaneous Mathematical Symbols-A
738 : 0x2980, 0x29FF, //Miscellaneous Mathematical Symbols-B
739 : 0x2A00, 0x2AFF, //Supplemental Mathematical Operators
740 : 0x2100, 0x214F, //Letterlike Symbols
741 : 0x2308, 0x230B, //Miscellaneous technical
742 : 0x25A0, 0x25FF, //Geometric Shapes
743 : 0x2B30, 0x2B4C //Miscellaneous Symbols and Arrows
744 2 : };
745 2 : OUString aWeaks(WEAKS, SAL_N_ELEMENTS(WEAKS));
746 : // only the listed values themselves are checked (one per index), not every code point between each pair
747 50 : for (sal_Int32 i = 0; i < aWeaks.getLength(); ++i)
748 : {
749 48 : sal_Int16 nScript = m_xBreak->getScriptType(aWeaks, i);
750 48 : OStringBuffer aMsg; // failure message naming the offending character in hex
751 48 : aMsg.append("Char 0x");
752 48 : aMsg.append(static_cast<sal_Int32>(aWeaks[i]), 16);
753 48 : aMsg.append(" should have been weak");
754 96 : CPPUNIT_ASSERT_MESSAGE(aMsg.getStr(),
755 48 : nScript == i18n::ScriptType::WEAK);
756 50 : }
757 2 : }
758 2 : }
759 :
760 : //A test to ensure that certain ranges and codepoints that are categorized as
761 : //asian remain as asian, so that existing docs that depend on this don't silently
762 : //change font for those asian chars.
763 : //See https://bugs.libreoffice.org/show_bug.cgi?id=38095
764 2 : void TestBreakIterator::testAsian() // Asserts that getScriptType() reports ScriptType::ASIAN for every code unit listed in ASIANS
765 : {
766 2 : lang::Locale aLocale; // set up below but not passed to getScriptType() in this function
767 2 : aLocale.Language = "en";
768 2 : aLocale.Country = "US";
769 :
770 : {
771 : const sal_Unicode ASIANS[] =
772 : {
773 : //some typical CJK chars
774 : 0x4E00, 0x62FF,
775 : //The full HalfWidth and FullWidth block has historically been
776 : //designated as taking the CJK font :-(
777 : //HalfWidth and FullWidth forms of ASCII 0-9, categorized under
778 : //UAX24 as "Common" i.e. by that logic WEAK
779 : 0xFF10, 0xFF19,
780 : //HalfWidth and FullWidth forms of ASCII A-z, categorized under
781 : //UAX24 as "Latin", i.e. by that logic LATIN
782 : 0xFF21, 0xFF5A
783 2 : };
784 2 : OUString aAsians(ASIANS, SAL_N_ELEMENTS(ASIANS));
785 : // only the listed values themselves are checked (one per index), not every code point between each pair
786 14 : for (sal_Int32 i = 0; i < aAsians.getLength(); ++i)
787 : {
788 12 : sal_Int16 nScript = m_xBreak->getScriptType(aAsians, i);
789 12 : OStringBuffer aMsg; // failure message naming the offending character in hex
790 12 : aMsg.append("Char 0x");
791 12 : aMsg.append(static_cast<sal_Int32>(aAsians[i]), 16);
792 12 : aMsg.append(" should have been asian");
793 24 : CPPUNIT_ASSERT_MESSAGE(aMsg.getStr(),
794 12 : nScript == i18n::ScriptType::ASIAN);
795 14 : }
796 2 : }
797 2 : }
798 :
799 : //A test to ensure that our Lao word boundary detection is useful
800 2 : void TestBreakIterator::testLao() // Checks the first two DICTIONARY_WORD boundaries (0..5 and 5..9) in an unspaced lo-LA text
801 : {
802 2 : lang::Locale aLocale;
803 2 : aLocale.Language = "lo";
804 2 : aLocale.Country = "LA";
805 : // the text contains no spaces, so the expected boundaries must come from the iterator's own word segmentation
806 2 : const sal_Unicode LAO[] = { 0x0e8d, 0x0eb4, 0x0e99, 0x0e94, 0x0eb5, 0x0e95, 0x0ec9, 0x0ead, 0x0e99, 0x0eae, 0x0eb1, 0x0e9a };
807 4 : OUString aTest(LAO, SAL_N_ELEMENTS(LAO));
808 2 : i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
809 2 : i18n::WordType::DICTIONARY_WORD, true);
810 :
811 2 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 5);
812 : // second lookup starts at the end of the first word
813 2 : aBounds = m_xBreak->getWordBoundary(aTest, aBounds.endPos, aLocale,
814 2 : i18n::WordType::DICTIONARY_WORD, true);
815 :
816 4 : CPPUNIT_ASSERT(aBounds.startPos == 5 && aBounds.endPos == 9);
817 :
818 2 : }
819 :
820 : //A test to ensure that our Thai word boundary detection is useful
821 2 : void TestBreakIterator::testThai()
822 : {
823 2 : lang::Locale aLocale;
824 2 : aLocale.Language = "th";
825 2 : aLocale.Country = "TH";
826 :
827 : //Part 1: a single Thai word must be reported as one word spanning the whole string. See http://lists.freedesktop.org/archives/libreoffice/2012-February/025959.html
828 : {
829 2 : const sal_Unicode THAI[] = { 0x0E01, 0x0E38, 0x0E2B, 0x0E25, 0x0E32, 0x0E1A };
830 2 : OUString aTest(THAI, SAL_N_ELEMENTS(THAI));
831 2 : i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
832 2 : i18n::WordType::DICTIONARY_WORD, true);
833 4 : CPPUNIT_ASSERT_MESSAGE("Should skip full word",
834 4 : aBounds.startPos == 0 && aBounds.endPos == aTest.getLength());
835 : }
836 :
837 : //Part 2: see https://issues.apache.org/ooo/show_bug.cgi?id=29548
838 : //make sure forwards and back are consistent: previousWord() must revisit, in reverse, the positions nextWord() produced
839 : {
840 : const sal_Unicode THAI[] =
841 : {
842 : 0x0E2D, 0x0E38, 0x0E17, 0x0E22, 0x0E32, 0x0E19, 0x0E41,
843 : 0x0E2B, 0x0E48, 0x0E07, 0x0E0A, 0x0E32, 0x0E15, 0x0E34,
844 : 0x0E19, 0x0E49, 0x0E33, 0x0E2B, 0x0E19, 0x0E32, 0x0E27,
845 : 0x0E2D, 0x0E38, 0x0E17, 0x0E22, 0x0E32, 0x0E19, 0x0E41,
846 : 0x0E2B, 0x0E48, 0x0E07, 0x0E0A, 0x0E32, 0x0E15, 0x0E34,
847 : 0x0E19, 0x0E49, 0x0E33, 0x0E2B, 0x0E19, 0x0E32, 0x0E27
848 2 : };
849 2 : OUString aTest(THAI, SAL_N_ELEMENTS(THAI));
850 :
851 4 : std::stack<sal_Int32> aPositions; //word starts seen going forwards; popped again going backwards
852 2 : sal_Int32 nPos = -1; //start before the first character
853 22 : do //forward pass: record every startPos returned by nextWord()
854 : {
855 22 : nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
856 22 : aPositions.push(nPos);
857 : }
858 22 : while (nPos < aTest.getLength());
859 2 : nPos = aTest.getLength(); //backward pass starts at the end of the text
860 2 : CPPUNIT_ASSERT(!aPositions.empty());
861 2 : aPositions.pop(); //drop the last entry: it is the position (>= length) that ended the forward loop
862 20 : do //backward pass: each previousWord() result must equal the next recorded position
863 : {
864 20 : CPPUNIT_ASSERT(!aPositions.empty());
865 20 : nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
866 20 : CPPUNIT_ASSERT(nPos == aPositions.top());
867 20 : aPositions.pop();
868 : }
869 22 : while (nPos > 0);
870 2 : }
871 2 : }
872 :
873 : #ifdef TODO //this test is only compiled when TODO is defined
874 : void TestBreakIterator::testNorthernThai() //a Northern Thai ("nod") sample must be reported as one word spanning the whole string
875 : {
876 : lang::Locale aLocale;
877 : aLocale.Language = "nod";
878 : aLocale.Country = "TH";
879 :
880 : const sal_Unicode NORTHERN_THAI1[] = { 0x0E01, 0x0E38, 0x0E4A, 0x0E2B, 0x0E25, 0x0E32, 0x0E1A };
881 : OUString aTest(NORTHERN_THAI1, SAL_N_ELEMENTS(NORTHERN_THAI1));
882 : i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
883 : i18n::WordType::DICTIONARY_WORD, true);
884 : CPPUNIT_ASSERT_MESSAGE("Should skip full word",
885 : aBounds.startPos == 0 && aBounds.endPos == aTest.getLength());
886 : }
887 : #endif
888 :
889 : #if (U_ICU_VERSION_MAJOR_NUM > 4)
890 : // Not sure if any version earlier than 49 did have Khmer word boundary
891 : // dictionaries, 4.6 does not.
892 :
893 : //A test to ensure that our Khmer word boundary detection is useful
894 : //https://bugs.libreoffice.org/show_bug.cgi?id=52020
895 2 : void TestBreakIterator::testKhmer()
896 : {
897 2 : lang::Locale aLocale;
898 2 : aLocale.Language = "km";
899 2 : aLocale.Country = "KH";
900 :
901 2 : const sal_Unicode KHMER[] = { 0x17B2, 0x17D2, 0x1799, 0x1782, 0x17C1 }; //5 code units, expected to split into two words
902 :
903 4 : OUString aTest(KHMER, SAL_N_ELEMENTS(KHMER));
904 2 : i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
905 2 : i18n::WordType::DICTIONARY_WORD, true);
906 :
907 2 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 3); //first word: positions 0..3
908 :
909 2 : aBounds = m_xBreak->getWordBoundary(aTest, aBounds.endPos, aLocale, //continue from where the first word ended
910 2 : i18n::WordType::DICTIONARY_WORD, true);
911 :
912 4 : CPPUNIT_ASSERT(aBounds.startPos == 3 && aBounds.endPos == 5); //second word: positions 3..5
913 2 : }
914 : #endif
915 :
916 4 : void TestBreakIterator::doTestJapanese(uno::Reference< i18n::XBreakIterator > &xBreak) //Japanese word-boundary checks run against the given iterator, so they can be repeated on more than one instance
917 : {
918 4 : lang::Locale aLocale;
919 4 : aLocale.Language = "ja";
920 4 : aLocale.Country = "JP";
921 4 : i18n::Boundary aBounds;
922 :
923 : { //case 1: querying inside a 7-unit sample must return the whole string as one word
924 4 : const sal_Unicode JAPANESE[] = { 0x30B7, 0x30E3, 0x30C3, 0x30C8, 0x30C0, 0x30A6, 0x30F3 };
925 :
926 4 : OUString aTest(JAPANESE, SAL_N_ELEMENTS(JAPANESE));
927 4 : aBounds = xBreak->getWordBoundary(aTest, 5, aLocale,
928 4 : i18n::WordType::DICTIONARY_WORD, true);
929 :
930 4 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 7);
931 : }
932 :
933 : { //case 2: the same three units repeated twice must be found as two separate words
934 4 : const sal_Unicode JAPANESE[] = { 0x9EBB, 0x306E, 0x8449, 0x9EBB, 0x306E, 0x8449 };
935 :
936 4 : OUString aTest(JAPANESE, SAL_N_ELEMENTS(JAPANESE));
937 4 : aBounds = xBreak->getWordBoundary(aTest, 1, aLocale, //position 1 lies in the first occurrence
938 4 : i18n::WordType::DICTIONARY_WORD, true);
939 :
940 4 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 3);
941 :
942 4 : aBounds = xBreak->getWordBoundary(aTest, 5, aLocale, //position 5 lies in the second occurrence
943 4 : i18n::WordType::DICTIONARY_WORD, true);
944 :
945 4 : CPPUNIT_ASSERT(aBounds.startPos == 3 && aBounds.endPos == 6);
946 4 : }
947 4 : }
948 :
949 2 : void TestBreakIterator::testJapanese() //runs doTestJapanese() on the fixture's iterator and then on a freshly created one
950 : {
951 2 : doTestJapanese(m_xBreak);
952 :
953 : // fdo#78479 - test second / cached instantiation of xdictionary
954 2 : uno::Reference< i18n::XBreakIterator > xTmpBreak(m_xSFactory->createInstance(
955 2 : "com.sun.star.i18n.BreakIterator"), uno::UNO_QUERY_THROW);
956 :
957 2 : doTestJapanese(xTmpBreak); //same checks must also pass on the second instance
958 2 : }
959 :
960 2 : void TestBreakIterator::testChinese() //word-boundary check on a Chinese sample that ends in a UTF-16 surrogate pair
961 : {
962 2 : lang::Locale aLocale;
963 2 : aLocale.Language = "zh";
964 2 : aLocale.Country = "CN";
965 2 : i18n::Boundary aBounds;
966 :
967 : {
968 2 : const sal_Unicode CHINESE[] = { 0x6A35, 0x6A30, 0x69FE, 0x8919, 0xD867, 0xDEDB }; //0xD867 0xDEDB is a high/low surrogate pair at indices 4 and 5
969 :
970 2 : OUString aTest(CHINESE, SAL_N_ELEMENTS(CHINESE));
971 2 : aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, //query at the first half of the surrogate pair
972 2 : i18n::WordType::DICTIONARY_WORD, true);
973 2 : CPPUNIT_ASSERT(aBounds.startPos == 4 && aBounds.endPos == 6); //the reported word must cover both halves of the pair
974 2 : }
975 2 : }
976 20 : void TestBreakIterator::setUp() //per-test fixture setup: creates the break iterator stored in m_xBreak
977 : {
978 20 : BootstrapFixtureBase::setUp(); //base fixture is set up first
979 60 : m_xBreak = uno::Reference< i18n::XBreakIterator >(m_xSFactory->createInstance( //instantiate the service by name through m_xSFactory
980 40 : "com.sun.star.i18n.BreakIterator"), uno::UNO_QUERY_THROW);
981 20 : }
982 :
983 20 : void TestBreakIterator::tearDown() //per-test fixture teardown, mirroring setUp() in reverse order
984 : {
985 20 : m_xBreak.clear(); //drop the iterator reference before the base fixture is torn down
986 20 : BootstrapFixtureBase::tearDown();
987 20 : }
988 :
989 2 : CPPUNIT_TEST_SUITE_REGISTRATION(TestBreakIterator); //registers the TestBreakIterator suite with CppUnit
990 :
991 8 : CPPUNIT_PLUGIN_IMPLEMENT(); //CppUnit plug-in boilerplate for this test library
992 :
993 : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|