Line data Source code
1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /*
3 : * This file is part of the LibreOffice project.
4 : *
5 : * This Source Code Form is subject to the terms of the Mozilla Public
6 : * License, v. 2.0. If a copy of the MPL was not distributed with this
7 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 : */
9 :
10 : #include <cppuhelper/compbase1.hxx>
11 : #include <cppuhelper/bootstrap.hxx>
12 : #include <cppuhelper/basemutex.hxx>
13 : #include <com/sun/star/i18n/XBreakIterator.hpp>
14 : #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
15 : #include <com/sun/star/i18n/ScriptType.hpp>
16 : #include <com/sun/star/i18n/WordType.hpp>
17 : #include <unotest/bootstrapfixturebase.hxx>
18 :
19 : #include <unicode/uversion.h>
20 :
21 : #include <rtl/strbuf.hxx>
22 : #include <rtl/ustrbuf.hxx>
23 :
24 : #include <string.h>
25 :
26 : #include <stack>
27 :
28 : using namespace ::com::sun::star;
29 :
30 30 : class TestBreakIterator : public test::BootstrapFixtureBase
31 : {
32 : public:
33 : virtual void setUp() SAL_OVERRIDE;
34 : virtual void tearDown() SAL_OVERRIDE;
35 :
36 : void testLineBreaking();
37 : void testWordBoundaries();
38 : void testGraphemeIteration();
39 : void testWeak();
40 : void testAsian();
41 : void testThai();
42 : #if (U_ICU_VERSION_MAJOR_NUM > 51)
43 : void testLao();
44 : #endif
45 : #ifdef TODO
46 : void testNorthernThai();
47 : #endif
48 : void testKhmer();
49 : void testJapanese();
50 : void testChinese();
51 :
52 2 : CPPUNIT_TEST_SUITE(TestBreakIterator);
53 1 : CPPUNIT_TEST(testLineBreaking);
54 1 : CPPUNIT_TEST(testWordBoundaries);
55 1 : CPPUNIT_TEST(testGraphemeIteration);
56 1 : CPPUNIT_TEST(testWeak);
57 1 : CPPUNIT_TEST(testAsian);
58 1 : CPPUNIT_TEST(testThai);
59 : #if (U_ICU_VERSION_MAJOR_NUM > 51)
60 1 : CPPUNIT_TEST(testLao);
61 : #endif
62 : #ifdef TODO
63 : CPPUNIT_TEST(testNorthernThai);
64 : #endif
65 : #if (U_ICU_VERSION_MAJOR_NUM > 4)
66 1 : CPPUNIT_TEST(testKhmer);
67 : #endif
68 1 : CPPUNIT_TEST(testJapanese);
69 1 : CPPUNIT_TEST(testChinese);
70 5 : CPPUNIT_TEST_SUITE_END();
71 :
72 : private:
73 : uno::Reference<i18n::XBreakIterator> m_xBreak;
74 : void doTestJapanese(uno::Reference< i18n::XBreakIterator > &xBreak);
75 : };
76 :
77 1 : void TestBreakIterator::testLineBreaking()
78 : {
79 1 : i18n::LineBreakHyphenationOptions aHyphOptions;
80 2 : i18n::LineBreakUserOptions aUserOptions;
81 2 : lang::Locale aLocale;
82 :
83 : //See https://bugs.libreoffice.org/show_bug.cgi?id=31271
84 : {
85 1 : OUString aTest("(some text here)");
86 :
87 1 : aLocale.Language = "en";
88 1 : aLocale.Country = "US";
89 :
90 : {
91 : //Here we want the line break to leave text here) on the next line
92 1 : i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some tex"), aLocale, 0, aHyphOptions, aUserOptions);
93 1 : CPPUNIT_ASSERT_MESSAGE("Expected a break at the start of the word", aResult.breakIndex == 6);
94 : }
95 :
96 : {
97 : //Here we want the line break to leave "here)" on the next line
98 1 : i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some text here"), aLocale, 0, aHyphOptions, aUserOptions);
99 1 : CPPUNIT_ASSERT_MESSAGE("Expected a break at the start of the word", aResult.breakIndex == 11);
100 1 : }
101 : }
102 :
103 : //See https://bugs.libreoffice.org/show_bug.cgi?id=49849
104 : {
105 1 : const sal_Unicode HEBREW1[] = { 0x05DE, 0x05D9, 0x05DC, 0x05D9, 0x5DD };
106 1 : OUString aWord(HEBREW1, SAL_N_ELEMENTS(HEBREW1));
107 2 : OUString aTest(OUStringBuffer(aWord).append(' ').append(aWord).makeStringAndClear());
108 :
109 1 : aLocale.Language = "he";
110 1 : aLocale.Country = "IL";
111 :
112 : {
113 : //Here we want the line break to happen at the whitespace
114 1 : i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-1, aLocale, 0, aHyphOptions, aUserOptions);
115 1 : CPPUNIT_ASSERT_MESSAGE("Expected a break at the start of the word", aResult.breakIndex == aWord.getLength()+1);
116 1 : }
117 : }
118 :
119 : //See https://bz.apache.org/ooo/show_bug.cgi?id=17155
120 : {
121 1 : OUString aTest("foo /bar/baz");
122 :
123 1 : aLocale.Language = "en";
124 1 : aLocale.Country = "US";
125 :
126 : {
127 : //Here we want the line break to leave /bar/ba clumped together on the next line
128 1 : i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("foo /bar/ba"), aLocale, 0,
129 1 : aHyphOptions, aUserOptions);
130 1 : CPPUNIT_ASSERT_MESSAGE("Expected a break at the first slash", aResult.breakIndex == 4);
131 1 : }
132 : }
133 :
134 : //See https://bz.apache.org/ooo/show_bug.cgi?id=19716
135 : {
136 1 : OUString aTest("aaa]aaa");
137 :
138 1 : aLocale.Language = "en";
139 1 : aLocale.Country = "US";
140 :
141 : {
142 : //Here we want the line break to move the whole lot to the next line
143 2 : i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-2, aLocale, 0,
144 2 : aHyphOptions, aUserOptions);
145 1 : CPPUNIT_ASSERT_MESSAGE("Expected a break at the start of the line, not at ]", aResult.breakIndex == 0);
146 1 : }
147 1 : }
148 1 : }
149 :
150 : //See https://bugs.libreoffice.org/show_bug.cgi?id=49629
151 1 : void TestBreakIterator::testWordBoundaries()
152 : {
153 1 : lang::Locale aLocale;
154 1 : aLocale.Language = "en";
155 1 : aLocale.Country = "US";
156 :
157 1 : i18n::Boundary aBounds;
158 :
159 : //See https://bz.apache.org/ooo/show_bug.cgi?id=11993
160 : {
161 1 : OUString aTest("abcd ef ghi??? KLM");
162 :
163 1 : CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD));
164 1 : CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD));
165 1 : aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD, true);
166 1 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
167 :
168 1 : CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD));
169 1 : CPPUNIT_ASSERT(!m_xBreak->isEndWord(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD));
170 :
171 : //next word
172 1 : aBounds = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, true);
173 1 : CPPUNIT_ASSERT(aBounds.startPos == 9 && aBounds.endPos == 12);
174 :
175 : //previous word
176 1 : aBounds = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, false);
177 1 : CPPUNIT_ASSERT(aBounds.startPos == 5 && aBounds.endPos == 7);
178 :
179 1 : CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD));
180 1 : CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD));
181 1 : aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD, true);
182 1 : CPPUNIT_ASSERT(aBounds.startPos == 9 && aBounds.endPos == 12);
183 :
184 1 : CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD));
185 1 : CPPUNIT_ASSERT(!m_xBreak->isEndWord(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD));
186 1 : aBounds = m_xBreak->getWordBoundary(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD, true);
187 1 : CPPUNIT_ASSERT(aBounds.startPos == 16 && aBounds.endPos == 19);
188 : }
189 :
190 : //See https://bz.apache.org/ooo/show_bug.cgi?id=21907
191 : {
192 1 : OUString aTest("b a?");
193 :
194 1 : CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 1, aLocale, i18n::WordType::ANY_WORD));
195 1 : CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 2, aLocale, i18n::WordType::ANY_WORD));
196 1 : CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 3, aLocale, i18n::WordType::ANY_WORD));
197 :
198 1 : CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES));
199 :
200 1 : CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 1, aLocale, i18n::WordType::ANY_WORD));
201 1 : CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 2, aLocale, i18n::WordType::ANY_WORD));
202 1 : CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 3, aLocale, i18n::WordType::ANY_WORD));
203 :
204 1 : CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES));
205 : }
206 :
207 : //See https://bz.apache.org/ooo/show_bug.cgi?id=14904
208 : {
209 : const sal_Unicode TEST[] =
210 : {
211 : 'W', 'o', 'r', 'k', 'i', 'n', 'g', ' ', 0x201C, 'W', 'o', 'r', 'd', 's',
212 : ' ', 's', 't', 'a', 'r', 't', 'i', 'n', 'g', ' ', 'w', 'i', 't',
213 : 'h', ' ', 'q', 'u', 'o', 't', 'e', 's', 0x201D, ' ', 'W', 'o', 'r', 'k',
214 : 'i', 'n', 'g', ' ', 0x2018, 'B', 'r', 'o', 'k', 'e', 'n', 0x2019, ' ',
215 : '?', 'S', 'p', 'a', 'n', 'i', 's', 'h', '?', ' ', 'd', 'o', 'e',
216 : 's', 'n', 0x2019, 't', ' ', 'w', 'o', 'r', 'k', '.', ' ', 'N', 'o',
217 : 't', ' ', 'e', 'v', 'e', 'n', ' ' , 0x00BF, 'r', 'e', 'a', 'l', '?', ' ',
218 : 'S', 'p', 'a', 'n', 'i', 's', 'h'
219 1 : };
220 1 : OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
221 :
222 1 : aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD, false);
223 1 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 7);
224 :
225 1 : aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD, false);
226 1 : CPPUNIT_ASSERT(aBounds.startPos == 9 && aBounds.endPos == 14);
227 :
228 1 : aBounds = m_xBreak->getWordBoundary(aTest, 40, aLocale, i18n::WordType::DICTIONARY_WORD, false);
229 1 : CPPUNIT_ASSERT(aBounds.startPos == 37 && aBounds.endPos == 44);
230 :
231 1 : aBounds = m_xBreak->getWordBoundary(aTest, 49, aLocale, i18n::WordType::DICTIONARY_WORD, false);
232 1 : CPPUNIT_ASSERT(aBounds.startPos == 46 && aBounds.endPos == 52);
233 :
234 1 : aBounds = m_xBreak->getWordBoundary(aTest, 58, aLocale, i18n::WordType::DICTIONARY_WORD, false);
235 1 : CPPUNIT_ASSERT(aBounds.startPos == 55 && aBounds.endPos == 62);
236 :
237 1 : aBounds = m_xBreak->getWordBoundary(aTest, 67, aLocale, i18n::WordType::DICTIONARY_WORD, false);
238 1 : CPPUNIT_ASSERT(aBounds.startPos == 64 && aBounds.endPos == 71);
239 :
240 1 : aBounds = m_xBreak->getWordBoundary(aTest, 90, aLocale, i18n::WordType::DICTIONARY_WORD, false);
241 1 : CPPUNIT_ASSERT(aBounds.startPos == 88 && aBounds.endPos == 92);
242 : }
243 :
244 : //See https://bugs.libreoffice.org/show_bug.cgi?id=49629
245 1 : sal_Unicode aBreakTests[] = { ' ', 1, 2, 3, 4, 5, 6, 7, 0x91, 0x92, 0x200B, 0xE8FF, 0xF8FF };
246 5 : for (int mode = i18n::WordType::ANY_WORD; mode <= i18n::WordType::WORD_COUNT; ++mode)
247 : {
248 : //make sure that in all cases isBeginWord and isEndWord matches getWordBoundary
249 56 : for (size_t i = 0; i < SAL_N_ELEMENTS(aBreakTests); ++i)
250 : {
251 : #if (U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM <= 2)
252 : //Note the breakiterator test is known to fail on older icu
253 : //versions (4.2.1) for the 200B (ZWSP) Zero Width Space testcase.
254 : if (aBreakTests[i] == 0x200B)
255 : continue;
256 : #endif
257 52 : OUString aTest = "Word" + OUString(aBreakTests[i]) + "Word";
258 52 : aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, mode, true);
259 52 : switch (mode)
260 : {
261 : case i18n::WordType::ANY_WORD:
262 13 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
263 13 : break;
264 : case i18n::WordType::ANYWORD_IGNOREWHITESPACES:
265 13 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
266 13 : break;
267 : case i18n::WordType::DICTIONARY_WORD:
268 13 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
269 13 : break;
270 : case i18n::WordType::WORD_COUNT:
271 13 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
272 13 : break;
273 : }
274 :
275 52 : CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, aBounds.startPos, aLocale, mode));
276 52 : CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, aBounds.endPos, aLocale, mode));
277 52 : }
278 : }
279 :
280 1 : sal_Unicode aJoinTests[] = { 'X', 0x200C, 0x200D, 0x2060, 0xFEFF, 0xFFF9, 0xFFFA, 0xFFFB };
281 5 : for (int mode = i18n::WordType::ANY_WORD; mode <= i18n::WordType::WORD_COUNT; ++mode)
282 : {
283 : //make sure that in all cases isBeginWord and isEndWord matches getWordBoundary
284 36 : for (size_t i = 0; i < SAL_N_ELEMENTS(aJoinTests); ++i)
285 : {
286 32 : OUString aTest = "Word" + OUString(aJoinTests[i]) + "Word";
287 32 : aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, mode, true);
288 32 : switch (mode)
289 : {
290 : case i18n::WordType::ANY_WORD:
291 8 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
292 8 : break;
293 : case i18n::WordType::ANYWORD_IGNOREWHITESPACES:
294 8 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
295 8 : break;
296 : case i18n::WordType::DICTIONARY_WORD:
297 8 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
298 8 : break;
299 : case i18n::WordType::WORD_COUNT:
300 8 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
301 8 : break;
302 : }
303 :
304 32 : CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, aBounds.startPos, aLocale, mode));
305 32 : CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, aBounds.endPos, aLocale, mode));
306 32 : }
307 : }
308 :
309 : //See https://bz.apache.org/ooo/show_bug.cgi?id=13494
310 : {
311 1 : const OUString aBase("xxAAxxBBxxCCxx");
312 : const sal_Unicode aTests[] =
313 : {
314 : '\'', ';', ',', '.', '!', '@', '#', '%', '&', '*',
315 : '(', ')', '_', '-', '{', '}', '[', ']', '\"', '/',
316 : '\\', '?', '~', '$', '+', '^', '=', '<', '>', '|'
317 1 : };
318 :
319 1 : const sal_Int32 aDoublePositions[] = {0, 2, 4, 6, 8, 10, 12, 14};
320 31 : for (size_t j = 0; j < SAL_N_ELEMENTS(aTests); ++j)
321 : {
322 30 : OUString aTest = aBase.replace('x', aTests[j]);
323 30 : sal_Int32 nPos = -1;
324 30 : size_t i = 0;
325 240 : do
326 : {
327 240 : CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aDoublePositions));
328 240 : nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
329 240 : CPPUNIT_ASSERT(nPos == aDoublePositions[i++]);
330 : }
331 240 : while (nPos < aTest.getLength());
332 30 : nPos = aTest.getLength();
333 30 : i = SAL_N_ELEMENTS(aDoublePositions)-1;
334 210 : do
335 : {
336 210 : nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
337 210 : CPPUNIT_ASSERT(nPos == aDoublePositions[--i]);
338 : }
339 : while (nPos > 0);
340 30 : }
341 :
342 1 : const sal_Int32 aSinglePositions[] = {0, 1, 3, 4, 6, 7, 9, 10};
343 30 : for (size_t j = 1; j < SAL_N_ELEMENTS(aTests); ++j)
344 : {
345 29 : OUString aTest = aBase.replaceAll(OUString("xx"), OUString(aTests[j]));
346 29 : sal_Int32 nPos = -1;
347 29 : size_t i = 0;
348 232 : do
349 : {
350 232 : CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aSinglePositions));
351 232 : nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
352 232 : CPPUNIT_ASSERT(nPos == aSinglePositions[i++]);
353 : }
354 232 : while (nPos < aTest.getLength());
355 29 : nPos = aTest.getLength();
356 29 : i = SAL_N_ELEMENTS(aSinglePositions)-1;
357 203 : do
358 : {
359 203 : nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
360 203 : CPPUNIT_ASSERT(nPos == aSinglePositions[--i]);
361 : }
362 : while (nPos > 0);
363 29 : }
364 :
365 1 : const sal_Int32 aSingleQuotePositions[] = {0, 1, 9, 10};
366 1 : CPPUNIT_ASSERT(aTests[0] == '\'');
367 : {
368 1 : OUString aTest = aBase.replaceAll(OUString("xx"), OUString(aTests[0]));
369 1 : sal_Int32 nPos = -1;
370 1 : size_t i = 0;
371 4 : do
372 : {
373 4 : CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aSingleQuotePositions));
374 4 : nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
375 4 : CPPUNIT_ASSERT(nPos == aSingleQuotePositions[i++]);
376 : }
377 4 : while (nPos < aTest.getLength());
378 1 : nPos = aTest.getLength();
379 1 : i = SAL_N_ELEMENTS(aSingleQuotePositions)-1;
380 3 : do
381 : {
382 3 : nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
383 3 : CPPUNIT_ASSERT(nPos == aSingleQuotePositions[--i]);
384 : }
385 1 : while (nPos > 0);
386 1 : }
387 : }
388 :
389 : //See https://bz.apache.org/ooo/show_bug.cgi?id=13451
390 : {
391 1 : aLocale.Language = "ca";
392 1 : aLocale.Country = "ES";
393 :
394 1 : OUString aTest("mirar-se comprar-vos donem-nos les mans aneu-vos-en!");
395 :
396 1 : sal_Int32 nPos = 0;
397 1 : sal_Int32 aExpected[] = {8, 20, 30, 34, 39, 51, 52};
398 1 : size_t i = 0;
399 7 : do
400 : {
401 7 : CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
402 7 : nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
403 7 : i18n::WordType::DICTIONARY_WORD, true).endPos;
404 7 : CPPUNIT_ASSERT(aExpected[i++] == nPos);
405 : }
406 7 : while (nPos++ < aTest.getLength());
407 1 : CPPUNIT_ASSERT(i == SAL_N_ELEMENTS(aExpected));
408 : }
409 :
410 : //See https://bz.apache.org/ooo/show_bug.cgi?id=85411
411 4 : for (int j = 0; j < 3; ++j)
412 : {
413 3 : switch (j)
414 : {
415 : case 0:
416 1 : aLocale.Language = "en";
417 1 : aLocale.Country = "US";
418 1 : break;
419 : case 1:
420 1 : aLocale.Language = "ca";
421 1 : aLocale.Country = "ES";
422 1 : break;
423 : case 2:
424 1 : aLocale.Language = "fi";
425 1 : aLocale.Country = "FI";
426 1 : break;
427 : default:
428 0 : CPPUNIT_ASSERT(false);
429 0 : break;
430 : }
431 :
432 : const sal_Unicode TEST[] =
433 : {
434 : 'I', 0x200B, 'w', 'a', 'n', 't', 0x200B, 't', 'o', 0x200B, 'g', 'o'
435 3 : };
436 3 : OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
437 :
438 3 : sal_Int32 nPos = 0;
439 3 : sal_Int32 aExpected[] = {1, 6, 9, 12};
440 3 : size_t i = 0;
441 12 : do
442 : {
443 12 : CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
444 12 : nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
445 12 : i18n::WordType::DICTIONARY_WORD, true).endPos;
446 12 : CPPUNIT_ASSERT(aExpected[i++] == nPos);
447 : }
448 12 : while (nPos++ < aTest.getLength());
449 3 : CPPUNIT_ASSERT(i == SAL_N_ELEMENTS(aExpected));
450 3 : }
451 :
452 : //https://bz.apache.org/ooo/show_bug.cgi?id=21290
453 3 : for (int j = 0; j < 2; ++j)
454 : {
455 2 : switch (j)
456 : {
457 : case 0:
458 1 : aLocale.Language = "en";
459 1 : aLocale.Country = "US";
460 1 : break;
461 : case 1:
462 1 : aLocale.Language = "grc";
463 1 : aLocale.Country.clear();
464 1 : break;
465 : default:
466 0 : CPPUNIT_ASSERT(false);
467 0 : break;
468 : }
469 :
470 : const sal_Unicode TEST[] =
471 : {
472 : 0x1F0C, 0x03BD, 0x03B4, 0x03C1, 0x03B1, 0x0020, 0x1F00,
473 : 0x03C1, 0x03BD, 0x1F7B, 0x03BC, 0x03B5, 0x03BD, 0x03BF,
474 : 0x03C2, 0x0020, 0x1F00, 0x03BB, 0x03BB, 0x0020, 0x1F24,
475 : 0x03C3, 0x03B8, 0x03B9, 0x03BF, 0x03BD
476 2 : };
477 2 : OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
478 :
479 2 : sal_Int32 nPos = 0;
480 2 : sal_Int32 aExpected[] = {5, 15, 19, 26};
481 2 : size_t i = 0;
482 8 : do
483 : {
484 8 : CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
485 8 : nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
486 8 : i18n::WordType::DICTIONARY_WORD, true).endPos;
487 8 : CPPUNIT_ASSERT(aExpected[i++] == nPos);
488 : }
489 8 : while (nPos++ < aTest.getLength());
490 2 : CPPUNIT_ASSERT(i == SAL_N_ELEMENTS(aExpected));
491 2 : }
492 :
493 : //See https://bz.apache.org/ooo/show_bug.cgi?id=58513
494 : //See https://bugs.libreoffice.org/show_bug.cgi?id=55707
495 : {
496 1 : aLocale.Language = "fi";
497 1 : aLocale.Country = "FI";
498 :
499 1 : OUString aTest("Kuorma-auto kaakkois- ja Keski-Suomi USA:n 90:n %:n");
500 :
501 : {
502 1 : sal_Int32 nPos = 0;
503 1 : sal_Int32 aExpected[] = {11, 21, 24, 36, 42, 47, 51};
504 1 : size_t i = 0;
505 7 : do
506 : {
507 7 : CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
508 7 : nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
509 7 : i18n::WordType::WORD_COUNT, true).endPos;
510 7 : CPPUNIT_ASSERT(aExpected[i++] == nPos);
511 : }
512 7 : while (nPos++ < aTest.getLength());
513 1 : CPPUNIT_ASSERT(i == SAL_N_ELEMENTS(aExpected));
514 : }
515 :
516 : {
517 1 : sal_Int32 nPos = 0;
518 : sal_Int32 aExpected[] = {0, 11, 12, 20, 22, 24, 25, 36, 37,
519 1 : 40, 41, 42, 43, 45, 46, 47, 50, 51};
520 1 : size_t i = 0;
521 9 : do
522 : {
523 9 : CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
524 9 : aBounds = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
525 9 : i18n::WordType::DICTIONARY_WORD, true);
526 9 : CPPUNIT_ASSERT(aExpected[i++] == aBounds.startPos);
527 9 : CPPUNIT_ASSERT(aExpected[i++] == aBounds.endPos);
528 9 : nPos = aBounds.endPos;
529 : }
530 9 : while (nPos++ < aTest.getLength());
531 1 : CPPUNIT_ASSERT(i == SAL_N_ELEMENTS(aExpected));
532 1 : }
533 : }
534 :
535 : //See https://bz.apache.org/ooo/show_bug.cgi?id=107843
536 : {
537 1 : aLocale.Language = "en";
538 1 : aLocale.Country = "US";
539 :
540 : const sal_Unicode TEST[] =
541 : {
542 : 'r', 'u', 0xFB00, 'l', 'e', ' ', 0xFB01, 's', 'h'
543 1 : };
544 1 : OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
545 :
546 1 : aBounds = m_xBreak->getWordBoundary(aTest, 1, aLocale, i18n::WordType::DICTIONARY_WORD, false);
547 1 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 5);
548 :
549 1 : aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, i18n::WordType::DICTIONARY_WORD, false);
550 1 : CPPUNIT_ASSERT(aBounds.startPos == 6 && aBounds.endPos == 9);
551 : }
552 :
553 : //See https://bz.apache.org/ooo/show_bug.cgi?id=113785
554 : {
555 1 : aLocale.Language = "en";
556 1 : aLocale.Country = "US";
557 :
558 : const sal_Unicode TEST[] =
559 : {
560 : 'a', 0x2013, 'b', 0x2014, 'c'
561 1 : };
562 1 : OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
563 :
564 1 : aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, i18n::WordType::DICTIONARY_WORD, true);
565 1 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 1);
566 :
567 1 : aBounds = m_xBreak->nextWord(aTest, 0, aLocale, i18n::WordType::DICTIONARY_WORD);
568 1 : CPPUNIT_ASSERT(aBounds.startPos == 2 && aBounds.endPos == 3);
569 :
570 1 : aBounds = m_xBreak->nextWord(aTest, aBounds.endPos, aLocale, i18n::WordType::DICTIONARY_WORD);
571 1 : CPPUNIT_ASSERT(aBounds.startPos == 4 && aBounds.endPos == 5);
572 1 : }
573 1 : }
574 :
575 : //See https://bugs.libreoffice.org/show_bug.cgi?id=40292
576 : //See https://bz.apache.org/ooo/show_bug.cgi?id=80412
577 : //See https://bz.apache.org/ooo/show_bug.cgi?id=111152
578 : //See https://bz.apache.org/ooo/show_bug.cgi?id=50172
579 1 : void TestBreakIterator::testGraphemeIteration()
580 : {
581 1 : lang::Locale aLocale;
582 1 : aLocale.Language = "bn";
583 1 : aLocale.Country = "IN";
584 :
585 : {
586 1 : const sal_Unicode BA_HALANT_LA[] = { 0x09AC, 0x09CD, 0x09AF };
587 1 : OUString aTest(BA_HALANT_LA, SAL_N_ELEMENTS(BA_HALANT_LA));
588 :
589 1 : sal_Int32 nDone=0;
590 : sal_Int32 nPos;
591 1 : nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
592 1 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
593 1 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == SAL_N_ELEMENTS(BA_HALANT_LA));
594 1 : nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(BA_HALANT_LA), aLocale,
595 1 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
596 1 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
597 : }
598 :
599 : {
600 1 : const sal_Unicode HA_HALANT_NA_VOWELSIGNI[] = { 0x09B9, 0x09CD, 0x09A3, 0x09BF };
601 1 : OUString aTest(HA_HALANT_NA_VOWELSIGNI, SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI));
602 :
603 1 : sal_Int32 nDone=0;
604 : sal_Int32 nPos;
605 1 : nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
606 1 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
607 1 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI));
608 1 : nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI), aLocale,
609 1 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
610 1 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
611 : }
612 :
613 : {
614 1 : const sal_Unicode TA_HALANT_MA_HALANT_YA [] = { 0x09A4, 0x09CD, 0x09AE, 0x09CD, 0x09AF };
615 1 : OUString aTest(TA_HALANT_MA_HALANT_YA, SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA));
616 :
617 1 : sal_Int32 nDone=0;
618 : sal_Int32 nPos;
619 1 : nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
620 1 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
621 1 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA));
622 1 : nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA), aLocale,
623 1 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
624 1 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
625 : }
626 :
627 1 : aLocale.Language = "ta";
628 1 : aLocale.Country = "IN";
629 :
630 : {
631 1 : const sal_Unicode KA_VIRAMA_SSA[] = { 0x0B95, 0x0BCD, 0x0BB7 };
632 1 : OUString aTest(KA_VIRAMA_SSA, SAL_N_ELEMENTS(KA_VIRAMA_SSA));
633 :
634 1 : sal_Int32 nDone=0;
635 1 : sal_Int32 nPos = 0;
636 :
637 1 : nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
638 1 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
639 1 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == SAL_N_ELEMENTS(KA_VIRAMA_SSA));
640 1 : nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(KA_VIRAMA_SSA), aLocale,
641 1 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
642 1 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
643 : }
644 :
645 : {
646 1 : const sal_Unicode KA_VOWELSIGNU[] = { 0x0B95, 0x0BC1 };
647 1 : OUString aTest(KA_VOWELSIGNU, SAL_N_ELEMENTS(KA_VOWELSIGNU));
648 :
649 1 : sal_Int32 nDone=0;
650 1 : sal_Int32 nPos = 0;
651 :
652 1 : nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
653 1 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
654 1 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == SAL_N_ELEMENTS(KA_VOWELSIGNU));
655 1 : nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(KA_VOWELSIGNU), aLocale,
656 1 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
657 1 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
658 : }
659 :
660 : {
661 : const sal_Unicode CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI[] =
662 1 : { 0x0B9A, 0x0BBF, 0x0BA4, 0x0BCD, 0x0BA4, 0x0BBF, 0x0BB0, 0x0BC8 };
663 : OUString aTest(CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI,
664 1 : SAL_N_ELEMENTS(CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI));
665 :
666 1 : sal_Int32 nDone=0;
667 1 : sal_Int32 nPos=0;
668 :
669 5 : for (sal_Int32 i = 0; i < 4; ++i)
670 : {
671 4 : sal_Int32 nOldPos = nPos;
672 4 : nPos = m_xBreak->nextCharacters(aTest, nPos, aLocale,
673 4 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
674 4 : CPPUNIT_ASSERT_MESSAGE("Should skip 2 units", nPos == nOldPos+2);
675 : }
676 :
677 5 : for (sal_Int32 i = 0; i < 4; ++i)
678 : {
679 4 : sal_Int32 nOldPos = nPos;
680 4 : nPos = m_xBreak->previousCharacters(aTest, nPos, aLocale,
681 4 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
682 4 : CPPUNIT_ASSERT_MESSAGE("Should skip 2 units", nPos == nOldPos-2);
683 1 : }
684 : }
685 :
686 : {
687 1 : const sal_Unicode ALEF_QAMATS [] = { 0x05D0, 0x05B8 };
688 1 : OUString aText(ALEF_QAMATS, SAL_N_ELEMENTS(ALEF_QAMATS));
689 :
690 1 : sal_Int32 nGraphemeCount = 0;
691 :
692 1 : sal_Int32 nCurPos = 0;
693 3 : while (nCurPos < aText.getLength())
694 : {
695 1 : sal_Int32 nCount2 = 1;
696 1 : nCurPos = m_xBreak->nextCharacters(aText, nCurPos, lang::Locale(),
697 1 : i18n::CharacterIteratorMode::SKIPCELL, nCount2, nCount2);
698 1 : ++nGraphemeCount;
699 : }
700 :
701 1 : CPPUNIT_ASSERT_MESSAGE("Should be considered 1 grapheme", nGraphemeCount == 1);
702 : }
703 :
704 1 : aLocale.Language = "hi";
705 1 : aLocale.Country = "IN";
706 :
707 : {
708 1 : const sal_Unicode SHA_VOWELSIGNII[] = { 0x936, 0x940 };
709 1 : OUString aTest(SHA_VOWELSIGNII, SAL_N_ELEMENTS(SHA_VOWELSIGNII));
710 :
711 1 : sal_Int32 nDone=0;
712 1 : sal_Int32 nPos = 0;
713 :
714 1 : nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
715 1 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
716 1 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == SAL_N_ELEMENTS(SHA_VOWELSIGNII));
717 1 : nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(SHA_VOWELSIGNII), aLocale,
718 1 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
719 1 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
720 1 : }
721 1 : }
722 :
723 : //A test to ensure that certain ranges and codepoints that are categorized as
724 : //weak remain as weak, so that existing docs that depend on this don't silently
725 : //change font for those weak chars
726 1 : void TestBreakIterator::testWeak()
727 : {
728 1 : lang::Locale aLocale;
729 1 : aLocale.Language = "en";
730 1 : aLocale.Country = "US";
731 :
732 : {
733 : const sal_Unicode WEAKS[] =
734 : {
735 : 0x0001, 0x0002,
736 : 0x0020, 0x00A0,
737 : 0x2150, 0x215F, //Number Forms, fractions
738 : 0x2160, 0x2180, //Number Forms, roman numerals
739 : 0x2200, 0x22FF, //Mathematical Operators
740 : 0x27C0, 0x27EF, //Miscellaneous Mathematical Symbols-A
741 : 0x2980, 0x29FF, //Miscellaneous Mathematical Symbols-B
742 : 0x2A00, 0x2AFF, //Supplemental Mathematical Operators
743 : 0x2100, 0x214F, //Letterlike Symbols
744 : 0x2308, 0x230B, //Miscellaneous technical
745 : 0x25A0, 0x25FF, //Geometric Shapes
746 : 0x2B30, 0x2B4C //Miscellaneous Symbols and Arrows
747 1 : };
748 1 : OUString aWeaks(WEAKS, SAL_N_ELEMENTS(WEAKS));
749 :
750 25 : for (sal_Int32 i = 0; i < aWeaks.getLength(); ++i)
751 : {
752 24 : sal_Int16 nScript = m_xBreak->getScriptType(aWeaks, i);
753 24 : OStringBuffer aMsg;
754 24 : aMsg.append("Char 0x");
755 24 : aMsg.append(static_cast<sal_Int32>(aWeaks[i]), 16);
756 24 : aMsg.append(" should have been weak");
757 48 : CPPUNIT_ASSERT_MESSAGE(aMsg.getStr(),
758 24 : nScript == i18n::ScriptType::WEAK);
759 25 : }
760 1 : }
761 1 : }
762 :
763 : //A test to ensure that certain ranges and codepoints that are categorized as
764 : //asian remain as asian, so that existing docs that depend on this don't silently
765 : //change font for those asian chars.
766 : //See https://bugs.libreoffice.org/show_bug.cgi?id=38095
767 1 : void TestBreakIterator::testAsian()
768 : {
769 1 : lang::Locale aLocale;
770 1 : aLocale.Language = "en";
771 1 : aLocale.Country = "US";
772 :
773 : {
774 : const sal_Unicode ASIANS[] =
775 : {
776 : //some typical CJK chars
777 : 0x4E00, 0x62FF,
778 : //The full HalfWidth and FullWidth block has historically been
779 : //designated as taking the CJK font :-(
780 : //HalfWidth and FullWidth forms of ASCII 0-9, categorized under
781 : //UAX24 as "Common" i.e. by that logic WEAK
782 : 0xFF10, 0xFF19,
783 : //HalfWidth and FullWidth forms of ASCII A-z, categorized under
784 : //UAX25 as "Latin", i.e. by that logic LATIN
785 : 0xFF21, 0xFF5A
786 1 : };
787 1 : OUString aAsians(ASIANS, SAL_N_ELEMENTS(ASIANS));
788 :
789 7 : for (sal_Int32 i = 0; i < aAsians.getLength(); ++i)
790 : {
791 6 : sal_Int16 nScript = m_xBreak->getScriptType(aAsians, i);
792 6 : OStringBuffer aMsg;
793 6 : aMsg.append("Char 0x");
794 6 : aMsg.append(static_cast<sal_Int32>(aAsians[i]), 16);
795 6 : aMsg.append(" should have been asian");
796 12 : CPPUNIT_ASSERT_MESSAGE(aMsg.getStr(),
797 6 : nScript == i18n::ScriptType::ASIAN);
798 7 : }
799 1 : }
800 1 : }
801 :
#if (U_ICU_VERSION_MAJOR_NUM > 51)
//Lao word boundary detection: the sample text is expected to start with a
//word covering code units 0-5, directly followed by one covering 5-9.
//Only built when the ICU major version is above 51.
void TestBreakIterator::testLao()
{
    const sal_Unicode aLaoUnits[] =
    {
        0x0e8d, 0x0eb4, 0x0e99, 0x0e94, 0x0eb5, 0x0e95,
        0x0ec9, 0x0ead, 0x0e99, 0x0eae, 0x0eb1, 0x0e9a
    };
    OUString aSample(aLaoUnits, SAL_N_ELEMENTS(aLaoUnits));

    lang::Locale aLao;
    aLao.Language = "lo";
    aLao.Country = "LA";

    //first word, looked up from the very start of the text
    i18n::Boundary aWord = m_xBreak->getWordBoundary(
        aSample, 0, aLao, i18n::WordType::DICTIONARY_WORD, true);
    CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aWord.startPos);
    CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aWord.endPos);

    //second word, looked up from where the first one ended
    const sal_Int32 nSecondStart = aWord.endPos;
    aWord = m_xBreak->getWordBoundary(
        aSample, nSecondStart, aLao, i18n::WordType::DICTIONARY_WORD, true);
    CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aWord.startPos);
    CPPUNIT_ASSERT_EQUAL(sal_Int32(9), aWord.endPos);
}
#endif
825 :
826 : //A test to ensure that our thai word boundary detection is useful
827 1 : void TestBreakIterator::testThai()
828 : {
829 1 : lang::Locale aLocale;
830 1 : aLocale.Language = "th";
831 1 : aLocale.Country = "TH";
832 :
833 : //See http://lists.freedesktop.org/archives/libreoffice/2012-February/025959.html
834 : {
835 1 : const sal_Unicode THAI[] = { 0x0E01, 0x0E38, 0x0E2B, 0x0E25, 0x0E32, 0x0E1A };
836 1 : OUString aTest(THAI, SAL_N_ELEMENTS(THAI));
837 1 : i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
838 1 : i18n::WordType::DICTIONARY_WORD, true);
839 2 : CPPUNIT_ASSERT_MESSAGE("Should skip full word",
840 2 : aBounds.startPos == 0 && aBounds.endPos == aTest.getLength());
841 : }
842 :
843 : //See https://bz.apache.org/ooo/show_bug.cgi?id=29548
844 : //make sure forwards and back are consistent
845 : {
846 : const sal_Unicode THAI[] =
847 : {
848 : 0x0E2D, 0x0E38, 0x0E17, 0x0E22, 0x0E32, 0x0E19, 0x0E41,
849 : 0x0E2B, 0x0E48, 0x0E07, 0x0E0A, 0x0E32, 0x0E15, 0x0E34,
850 : 0x0E19, 0x0E49, 0x0E33, 0x0E2B, 0x0E19, 0x0E32, 0x0E27,
851 : 0x0E2D, 0x0E38, 0x0E17, 0x0E22, 0x0E32, 0x0E19, 0x0E41,
852 : 0x0E2B, 0x0E48, 0x0E07, 0x0E0A, 0x0E32, 0x0E15, 0x0E34,
853 : 0x0E19, 0x0E49, 0x0E33, 0x0E2B, 0x0E19, 0x0E32, 0x0E27
854 1 : };
855 1 : OUString aTest(THAI, SAL_N_ELEMENTS(THAI));
856 :
857 2 : std::stack<sal_Int32> aPositions;
858 1 : sal_Int32 nPos = -1;
859 11 : do
860 : {
861 11 : nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
862 11 : aPositions.push(nPos);
863 : }
864 11 : while (nPos < aTest.getLength());
865 1 : nPos = aTest.getLength();
866 1 : CPPUNIT_ASSERT(!aPositions.empty());
867 1 : aPositions.pop();
868 10 : do
869 : {
870 10 : CPPUNIT_ASSERT(!aPositions.empty());
871 10 : nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
872 10 : CPPUNIT_ASSERT(nPos == aPositions.top());
873 10 : aPositions.pop();
874 : }
875 11 : while (nPos > 0);
876 1 : }
877 1 : }
878 :
#ifdef TODO
//Only compiled when TODO is defined.
//Looks up the DICTIONARY_WORD at offset 0 of a Northern Thai ("nod"/"TH")
//string and expects the result to span the whole text.
void TestBreakIterator::testNorthernThai()
{
    const lang::Locale aNorthernThai("nod", "TH", OUString());

    const sal_Unicode NORTHERN_THAI1[] =
    {
        0x0E01, 0x0E38, 0x0E4A, 0x0E2B, 0x0E25, 0x0E32, 0x0E1A
    };
    const OUString aText(NORTHERN_THAI1, SAL_N_ELEMENTS(NORTHERN_THAI1));

    const i18n::Boundary aWord = m_xBreak->getWordBoundary(
        aText, 0, aNorthernThai, i18n::WordType::DICTIONARY_WORD, true);

    CPPUNIT_ASSERT_MESSAGE("Should skip full word",
        aWord.startPos == 0 && aWord.endPos == aText.getLength());
}
#endif
894 :
#if (U_ICU_VERSION_MAJOR_NUM > 4)
// Not sure if any version earlier than 49 did have Khmer word boundary
// dictionaries, 4.6 does not.

//A test to ensure that our khmer word boundary detection is useful
//https://bugs.libreoffice.org/show_bug.cgi?id=52020
//Two DICTIONARY_WORD lookups are made on a five code unit Khmer string: the
//first at offset 0, the second at the end of the first result. The expected
//boundaries are 0..3 and 3..5.
void TestBreakIterator::testKhmer()
{
    lang::Locale aLocale;
    aLocale.Language = "km";
    aLocale.Country = "KH";

    const sal_Unicode KHMER[] = { 0x17B2, 0x17D2, 0x1799, 0x1782, 0x17C1 };

    OUString aTest(KHMER, SAL_N_ELEMENTS(KHMER));
    i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
        i18n::WordType::DICTIONARY_WORD, true);

    //start and end are checked separately so that a failure reports the
    //value that was actually returned
    CPPUNIT_ASSERT_EQUAL(sal_Int32(0), aBounds.startPos);
    CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.endPos);

    aBounds = m_xBreak->getWordBoundary(aTest, aBounds.endPos, aLocale,
        i18n::WordType::DICTIONARY_WORD, true);

    CPPUNIT_ASSERT_EQUAL(sal_Int32(3), aBounds.startPos);
    CPPUNIT_ASSERT_EQUAL(sal_Int32(5), aBounds.endPos);
}
#endif
921 :
922 2 : void TestBreakIterator::doTestJapanese(uno::Reference< i18n::XBreakIterator > &xBreak)
923 : {
924 2 : lang::Locale aLocale;
925 2 : aLocale.Language = "ja";
926 2 : aLocale.Country = "JP";
927 2 : i18n::Boundary aBounds;
928 :
929 : {
930 2 : const sal_Unicode JAPANESE[] = { 0x30B7, 0x30E3, 0x30C3, 0x30C8, 0x30C0, 0x30A6, 0x30F3 };
931 :
932 2 : OUString aTest(JAPANESE, SAL_N_ELEMENTS(JAPANESE));
933 2 : aBounds = xBreak->getWordBoundary(aTest, 5, aLocale,
934 2 : i18n::WordType::DICTIONARY_WORD, true);
935 :
936 2 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 7);
937 : }
938 :
939 : {
940 2 : const sal_Unicode JAPANESE[] = { 0x9EBB, 0x306E, 0x8449, 0x9EBB, 0x306E, 0x8449 };
941 :
942 2 : OUString aTest(JAPANESE, SAL_N_ELEMENTS(JAPANESE));
943 2 : aBounds = xBreak->getWordBoundary(aTest, 1, aLocale,
944 2 : i18n::WordType::DICTIONARY_WORD, true);
945 :
946 2 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 3);
947 :
948 2 : aBounds = xBreak->getWordBoundary(aTest, 5, aLocale,
949 2 : i18n::WordType::DICTIONARY_WORD, true);
950 :
951 2 : CPPUNIT_ASSERT(aBounds.startPos == 3 && aBounds.endPos == 6);
952 2 : }
953 2 : }
954 :
955 1 : void TestBreakIterator::testJapanese()
956 : {
957 1 : doTestJapanese(m_xBreak);
958 :
959 : // fdo#78479 - test second / cached instantiation of xdictionary
960 1 : uno::Reference< i18n::XBreakIterator > xTmpBreak(m_xSFactory->createInstance(
961 1 : "com.sun.star.i18n.BreakIterator"), uno::UNO_QUERY_THROW);
962 :
963 1 : doTestJapanese(xTmpBreak);
964 1 : }
965 :
966 1 : void TestBreakIterator::testChinese()
967 : {
968 1 : lang::Locale aLocale;
969 1 : aLocale.Language = "zh";
970 1 : aLocale.Country = "CN";
971 1 : i18n::Boundary aBounds;
972 :
973 : {
974 1 : const sal_Unicode CHINESE[] = { 0x6A35, 0x6A30, 0x69FE, 0x8919, 0xD867, 0xDEDB };
975 :
976 1 : OUString aTest(CHINESE, SAL_N_ELEMENTS(CHINESE));
977 1 : aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale,
978 1 : i18n::WordType::DICTIONARY_WORD, true);
979 1 : CPPUNIT_ASSERT(aBounds.startPos == 4 && aBounds.endPos == 6);
980 1 : }
981 1 : }
982 10 : void TestBreakIterator::setUp()
983 : {
984 10 : BootstrapFixtureBase::setUp();
985 30 : m_xBreak = uno::Reference< i18n::XBreakIterator >(m_xSFactory->createInstance(
986 20 : "com.sun.star.i18n.BreakIterator"), uno::UNO_QUERY_THROW);
987 10 : }
988 :
989 10 : void TestBreakIterator::tearDown()
990 : {
991 10 : m_xBreak.clear();
992 10 : BootstrapFixtureBase::tearDown();
993 10 : }
994 :
995 1 : CPPUNIT_TEST_SUITE_REGISTRATION(TestBreakIterator);
996 :
997 4 : CPPUNIT_PLUGIN_IMPLEMENT();
998 :
999 : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|