Line data Source code
1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /*
3 : * This file is part of the LibreOffice project.
4 : *
5 : * This Source Code Form is subject to the terms of the Mozilla Public
6 : * License, v. 2.0. If a copy of the MPL was not distributed with this
7 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 : */
9 :
10 : #include <cppuhelper/compbase1.hxx>
11 : #include <cppuhelper/bootstrap.hxx>
12 : #include <cppuhelper/basemutex.hxx>
13 : #include <com/sun/star/i18n/XBreakIterator.hpp>
14 : #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
15 : #include <com/sun/star/i18n/ScriptType.hpp>
16 : #include <com/sun/star/i18n/WordType.hpp>
17 : #include <unotest/bootstrapfixturebase.hxx>
18 :
19 : #include <unicode/uversion.h>
20 :
21 : #include <rtl/strbuf.hxx>
22 : #include <rtl/ustrbuf.hxx>
23 :
24 : #include <string.h>
25 :
26 : #include <stack>
27 :
28 : using namespace ::com::sun::star;
29 :
30 27 : class TestBreakIterator : public test::BootstrapFixtureBase
31 : {
32 : public:
33 : virtual void setUp() SAL_OVERRIDE;
34 : virtual void tearDown() SAL_OVERRIDE;
35 :
36 : void testLineBreaking();
37 : void testWordBoundaries();
38 : void testGraphemeIteration();
39 : void testWeak();
40 : void testAsian();
41 : void testThai();
42 : void testLao();
43 : #ifdef TODO
44 : void testNorthernThai();
45 : #endif
46 : void testKhmer();
47 : void testJapanese();
48 : void testChinese();
49 2 : CPPUNIT_TEST_SUITE(TestBreakIterator);
50 1 : CPPUNIT_TEST(testLineBreaking);
51 1 : CPPUNIT_TEST(testGraphemeIteration);
52 1 : CPPUNIT_TEST(testWeak);
53 1 : CPPUNIT_TEST(testAsian);
54 1 : CPPUNIT_TEST(testThai);
55 : #ifdef TODO
56 : CPPUNIT_TEST(testNorthernThai);
57 : #endif
58 :
59 1 : CPPUNIT_TEST(testWordBoundaries);
60 : #if (U_ICU_VERSION_MAJOR_NUM > 4)
61 1 : CPPUNIT_TEST(testKhmer);
62 : #endif
63 : #if (U_ICU_VERSION_MAJOR_NUM > 51)
64 : CPPUNIT_TEST(testLao);
65 : #endif
66 1 : CPPUNIT_TEST(testJapanese);
67 1 : CPPUNIT_TEST(testChinese);
68 2 : CPPUNIT_TEST_SUITE_END();
69 : private:
70 : uno::Reference<i18n::XBreakIterator> m_xBreak;
71 : };
72 :
73 1 : void TestBreakIterator::testLineBreaking()
74 : {
75 1 : i18n::LineBreakHyphenationOptions aHyphOptions;
76 2 : i18n::LineBreakUserOptions aUserOptions;
77 2 : lang::Locale aLocale;
78 :
79 : //See https://bugs.libreoffice.org/show_bug.cgi?id=31271
80 : {
81 1 : OUString aTest("(some text here)");
82 :
83 1 : aLocale.Language = "en";
84 1 : aLocale.Country = "US";
85 :
86 : {
87 : //Here we want the line break to leave text here) on the next line
88 1 : i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some tex"), aLocale, 0, aHyphOptions, aUserOptions);
89 1 : CPPUNIT_ASSERT_MESSAGE("Expected a break at the start of the word", aResult.breakIndex == 6);
90 : }
91 :
92 : {
93 : //Here we want the line break to leave "here)" on the next line
94 1 : i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some text here"), aLocale, 0, aHyphOptions, aUserOptions);
95 1 : CPPUNIT_ASSERT_MESSAGE("Expected a break at the start of the word", aResult.breakIndex == 11);
96 1 : }
97 : }
98 :
99 : //See https://bugs.libreoffice.org/show_bug.cgi?id=49849
100 : {
101 1 : const sal_Unicode HEBREW1[] = { 0x05DE, 0x05D9, 0x05DC, 0x05D9, 0x5DD };
102 1 : OUString aWord(HEBREW1, SAL_N_ELEMENTS(HEBREW1));
103 2 : OUString aTest(OUStringBuffer(aWord).append(' ').append(aWord).makeStringAndClear());
104 :
105 1 : aLocale.Language = "he";
106 1 : aLocale.Country = "IL";
107 :
108 : {
109 : //Here we want the line break to happen at the whitespace
110 1 : i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-1, aLocale, 0, aHyphOptions, aUserOptions);
111 1 : CPPUNIT_ASSERT_MESSAGE("Expected a break at the start of the word", aResult.breakIndex == aWord.getLength()+1);
112 1 : }
113 : }
114 :
115 : //See https://issues.apache.org/ooo/show_bug.cgi?id=17155
116 : {
117 1 : OUString aTest("foo /bar/baz");
118 :
119 1 : aLocale.Language = "en";
120 1 : aLocale.Country = "US";
121 :
122 : {
123 : //Here we want the line break to leave /bar/ba clumped together on the next line
124 1 : i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("foo /bar/ba"), aLocale, 0,
125 1 : aHyphOptions, aUserOptions);
126 1 : CPPUNIT_ASSERT_MESSAGE("Expected a break at the first slash", aResult.breakIndex == 4);
127 1 : }
128 : }
129 :
130 : //See https://issues.apache.org/ooo/show_bug.cgi?id=19716
131 : {
132 1 : OUString aTest("aaa]aaa");
133 :
134 1 : aLocale.Language = "en";
135 1 : aLocale.Country = "US";
136 :
137 : {
138 : //Here we want the line break to move the whole lot to the next line
139 2 : i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-2, aLocale, 0,
140 2 : aHyphOptions, aUserOptions);
141 1 : CPPUNIT_ASSERT_MESSAGE("Expected a break at the start of the line, not at ]", aResult.breakIndex == 0);
142 1 : }
143 1 : }
144 1 : }
145 :
146 : //See https://bugs.libreoffice.org/show_bug.cgi?id=49629
147 1 : void TestBreakIterator::testWordBoundaries()
148 : {
149 1 : lang::Locale aLocale;
150 1 : aLocale.Language = "en";
151 1 : aLocale.Country = "US";
152 :
153 1 : i18n::Boundary aBounds;
154 :
155 : //See https://issues.apache.org/ooo/show_bug.cgi?id=11993
156 : {
157 1 : OUString aTest("abcd ef ghi??? KLM");
158 :
159 1 : CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD));
160 1 : CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD));
161 1 : aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD, true);
162 1 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
163 :
164 1 : CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD));
165 1 : CPPUNIT_ASSERT(!m_xBreak->isEndWord(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD));
166 :
167 : //next word
168 1 : aBounds = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, true);
169 1 : CPPUNIT_ASSERT(aBounds.startPos == 9 && aBounds.endPos == 12);
170 :
171 : //previous word
172 1 : aBounds = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, false);
173 1 : CPPUNIT_ASSERT(aBounds.startPos == 5 && aBounds.endPos == 7);
174 :
175 1 : CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD));
176 1 : CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD));
177 1 : aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD, true);
178 1 : CPPUNIT_ASSERT(aBounds.startPos == 9 && aBounds.endPos == 12);
179 :
180 1 : CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD));
181 1 : CPPUNIT_ASSERT(!m_xBreak->isEndWord(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD));
182 1 : aBounds = m_xBreak->getWordBoundary(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD, true);
183 1 : CPPUNIT_ASSERT(aBounds.startPos == 16 && aBounds.endPos == 19);
184 : }
185 :
186 : //See https://issues.apache.org/ooo/show_bug.cgi?id=21907
187 : {
188 1 : OUString aTest("b a?");
189 :
190 1 : CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 1, aLocale, i18n::WordType::ANY_WORD));
191 1 : CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 2, aLocale, i18n::WordType::ANY_WORD));
192 1 : CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 3, aLocale, i18n::WordType::ANY_WORD));
193 :
194 1 : CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES));
195 :
196 1 : CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 1, aLocale, i18n::WordType::ANY_WORD));
197 1 : CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 2, aLocale, i18n::WordType::ANY_WORD));
198 1 : CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 3, aLocale, i18n::WordType::ANY_WORD));
199 :
200 1 : CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES));
201 : }
202 :
203 : //See https://issues.apache.org/ooo/show_bug.cgi?id=14904
204 : {
205 : const sal_Unicode TEST[] =
206 : {
207 : 'W', 'o', 'r', 'k', 'i', 'n', 'g', ' ', 0x201C, 'W', 'o', 'r', 'd', 's',
208 : ' ', 's', 't', 'a', 'r', 't', 'i', 'n', 'g', ' ', 'w', 'i', 't',
209 : 'h', ' ', 'q', 'u', 'o', 't', 'e', 's', 0x201D, ' ', 'W', 'o', 'r', 'k',
210 : 'i', 'n', 'g', ' ', 0x2018, 'B', 'r', 'o', 'k', 'e', 'n', 0x2019, ' ',
211 : '?', 'S', 'p', 'a', 'n', 'i', 's', 'h', '?', ' ', 'd', 'o', 'e',
212 : 's', 'n', 0x2019, 't', ' ', 'w', 'o', 'r', 'k', '.', ' ', 'N', 'o',
213 : 't', ' ', 'e', 'v', 'e', 'n', ' ' , 0x00BF, 'r', 'e', 'a', 'l', '?', ' ',
214 : 'S', 'p', 'a', 'n', 'i', 's', 'h'
215 1 : };
216 1 : OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
217 :
218 1 : aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD, false);
219 1 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 7);
220 :
221 1 : aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD, false);
222 1 : CPPUNIT_ASSERT(aBounds.startPos == 9 && aBounds.endPos == 14);
223 :
224 1 : aBounds = m_xBreak->getWordBoundary(aTest, 40, aLocale, i18n::WordType::DICTIONARY_WORD, false);
225 1 : CPPUNIT_ASSERT(aBounds.startPos == 37 && aBounds.endPos == 44);
226 :
227 1 : aBounds = m_xBreak->getWordBoundary(aTest, 49, aLocale, i18n::WordType::DICTIONARY_WORD, false);
228 1 : CPPUNIT_ASSERT(aBounds.startPos == 46 && aBounds.endPos == 52);
229 :
230 1 : aBounds = m_xBreak->getWordBoundary(aTest, 58, aLocale, i18n::WordType::DICTIONARY_WORD, false);
231 1 : CPPUNIT_ASSERT(aBounds.startPos == 55 && aBounds.endPos == 62);
232 :
233 1 : aBounds = m_xBreak->getWordBoundary(aTest, 67, aLocale, i18n::WordType::DICTIONARY_WORD, false);
234 1 : CPPUNIT_ASSERT(aBounds.startPos == 64 && aBounds.endPos == 71);
235 :
236 1 : aBounds = m_xBreak->getWordBoundary(aTest, 90, aLocale, i18n::WordType::DICTIONARY_WORD, false);
237 1 : CPPUNIT_ASSERT(aBounds.startPos == 88 && aBounds.endPos == 92);
238 : }
239 :
240 : //See https://bugs.libreoffice.org/show_bug.cgi?id=49629
241 1 : sal_Unicode aBreakTests[] = { ' ', 1, 2, 3, 4, 5, 6, 7, 0x91, 0x92, 0x200B, 0xE8FF, 0xF8FF };
242 5 : for (int mode = i18n::WordType::ANY_WORD; mode <= i18n::WordType::WORD_COUNT; ++mode)
243 : {
244 : //make sure that in all cases isBeginWord and isEndWord matches getWordBoundary
245 56 : for (size_t i = 0; i < SAL_N_ELEMENTS(aBreakTests); ++i)
246 : {
247 : #if (U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM <= 2)
248 : //Note the breakiterator test is known to fail on older icu
249 : //versions (4.2.1) for the 200B (ZWSP) Zero Width Space testcase.
250 : if (aBreakTests[i] == 0x200B)
251 : continue;
252 : #endif
253 52 : OUString aTest = "Word" + OUString(aBreakTests[i]) + "Word";
254 52 : aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, mode, true);
255 52 : switch (mode)
256 : {
257 : case i18n::WordType::ANY_WORD:
258 13 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
259 13 : break;
260 : case i18n::WordType::ANYWORD_IGNOREWHITESPACES:
261 13 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
262 13 : break;
263 : case i18n::WordType::DICTIONARY_WORD:
264 13 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
265 13 : break;
266 : case i18n::WordType::WORD_COUNT:
267 13 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
268 13 : break;
269 : }
270 :
271 52 : CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, aBounds.startPos, aLocale, mode));
272 52 : CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, aBounds.endPos, aLocale, mode));
273 52 : }
274 : }
275 :
276 1 : sal_Unicode aJoinTests[] = { 'X', 0x200C, 0x200D, 0x2060, 0xFEFF, 0xFFF9, 0xFFFA, 0xFFFB };
277 5 : for (int mode = i18n::WordType::ANY_WORD; mode <= i18n::WordType::WORD_COUNT; ++mode)
278 : {
279 : //make sure that in all cases isBeginWord and isEndWord matches getWordBoundary
280 36 : for (size_t i = 0; i < SAL_N_ELEMENTS(aJoinTests); ++i)
281 : {
282 32 : OUString aTest = "Word" + OUString(aJoinTests[i]) + "Word";
283 32 : aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, mode, true);
284 32 : switch (mode)
285 : {
286 : case i18n::WordType::ANY_WORD:
287 8 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
288 8 : break;
289 : case i18n::WordType::ANYWORD_IGNOREWHITESPACES:
290 8 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
291 8 : break;
292 : case i18n::WordType::DICTIONARY_WORD:
293 8 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
294 8 : break;
295 : case i18n::WordType::WORD_COUNT:
296 8 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
297 8 : break;
298 : }
299 :
300 32 : CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, aBounds.startPos, aLocale, mode));
301 32 : CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, aBounds.endPos, aLocale, mode));
302 32 : }
303 : }
304 :
305 : //See https://issues.apache.org/ooo/show_bug.cgi?id=13494
306 : {
307 1 : const OUString aBase("xxAAxxBBxxCCxx");
308 : const sal_Unicode aTests[] =
309 : {
310 : '\'', ';', ',', '.', '!', '@', '#', '%', '&', '*',
311 : '(', ')', '_', '-', '{', '}', '[', ']', '\"', '/',
312 : '\\', '?', '~', '$', '+', '^', '=', '<', '>', '|'
313 1 : };
314 :
315 1 : const sal_Int32 aDoublePositions[] = {0, 2, 4, 6, 8, 10, 12, 14};
316 31 : for (size_t j = 0; j < SAL_N_ELEMENTS(aTests); ++j)
317 : {
318 30 : OUString aTest = aBase.replace('x', aTests[j]);
319 30 : sal_Int32 nPos = -1;
320 30 : size_t i = 0;
321 240 : do
322 : {
323 240 : CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aDoublePositions));
324 240 : nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
325 240 : CPPUNIT_ASSERT(nPos == aDoublePositions[i++]);
326 : }
327 240 : while (nPos < aTest.getLength());
328 30 : nPos = aTest.getLength();
329 30 : i = SAL_N_ELEMENTS(aDoublePositions)-1;
330 210 : do
331 : {
332 210 : nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
333 210 : CPPUNIT_ASSERT(nPos == aDoublePositions[--i]);
334 : }
335 : while (nPos > 0);
336 30 : }
337 :
338 1 : const sal_Int32 aSinglePositions[] = {0, 1, 3, 4, 6, 7, 9, 10};
339 30 : for (size_t j = 1; j < SAL_N_ELEMENTS(aTests); ++j)
340 : {
341 29 : OUString aTest = aBase.replaceAll(OUString("xx"), OUString(aTests[j]));
342 29 : sal_Int32 nPos = -1;
343 29 : size_t i = 0;
344 232 : do
345 : {
346 232 : CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aSinglePositions));
347 232 : nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
348 232 : CPPUNIT_ASSERT(nPos == aSinglePositions[i++]);
349 : }
350 232 : while (nPos < aTest.getLength());
351 29 : nPos = aTest.getLength();
352 29 : i = SAL_N_ELEMENTS(aSinglePositions)-1;
353 203 : do
354 : {
355 203 : nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
356 203 : CPPUNIT_ASSERT(nPos == aSinglePositions[--i]);
357 : }
358 : while (nPos > 0);
359 29 : }
360 :
361 1 : const sal_Int32 aSingleQuotePositions[] = {0, 1, 9, 10};
362 1 : CPPUNIT_ASSERT(aTests[0] == '\'');
363 : {
364 1 : OUString aTest = aBase.replaceAll(OUString("xx"), OUString(aTests[0]));
365 1 : sal_Int32 nPos = -1;
366 1 : size_t i = 0;
367 4 : do
368 : {
369 4 : CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aSingleQuotePositions));
370 4 : nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
371 4 : CPPUNIT_ASSERT(nPos == aSingleQuotePositions[i++]);
372 : }
373 4 : while (nPos < aTest.getLength());
374 1 : nPos = aTest.getLength();
375 1 : i = SAL_N_ELEMENTS(aSingleQuotePositions)-1;
376 3 : do
377 : {
378 3 : nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
379 3 : CPPUNIT_ASSERT(nPos == aSingleQuotePositions[--i]);
380 : }
381 1 : while (nPos > 0);
382 1 : }
383 : }
384 :
385 : //See https://issues.apache.org/ooo/show_bug.cgi?id=13451
386 : {
387 1 : aLocale.Language = "ca";
388 1 : aLocale.Country = "ES";
389 :
390 1 : OUString aTest("mirar-se comprar-vos donem-nos les mans aneu-vos-en!");
391 :
392 1 : sal_Int32 nPos = 0;
393 1 : sal_Int32 aExpected[] = {8, 20, 30, 34, 39, 51, 52};
394 1 : size_t i = 0;
395 7 : do
396 : {
397 7 : CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
398 7 : nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
399 7 : i18n::WordType::DICTIONARY_WORD, true).endPos;
400 7 : CPPUNIT_ASSERT(aExpected[i++] == nPos);
401 : }
402 7 : while (nPos++ < aTest.getLength());
403 1 : CPPUNIT_ASSERT(i == SAL_N_ELEMENTS(aExpected));
404 : }
405 :
406 : //See https://issues.apache.org/ooo/show_bug.cgi?id=85411
407 3 : for (int j = 0; j < 2; ++j)
408 : {
409 2 : switch (j)
410 : {
411 : case 0:
412 1 : aLocale.Language = "en";
413 1 : aLocale.Country = "US";
414 1 : break;
415 : case 1:
416 1 : aLocale.Language = "ca";
417 1 : aLocale.Country = "ES";
418 1 : break;
419 : default:
420 0 : CPPUNIT_ASSERT(false);
421 0 : break;
422 : }
423 :
424 : const sal_Unicode TEST[] =
425 : {
426 : 'I', 0x200B, 'w', 'a', 'n', 't', 0x200B, 't', 'o', 0x200B, 'g', 'o'
427 2 : };
428 2 : OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
429 :
430 2 : sal_Int32 nPos = 0;
431 2 : sal_Int32 aExpected[] = {1, 6, 9, 12};
432 2 : size_t i = 0;
433 8 : do
434 : {
435 8 : CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
436 8 : nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
437 8 : i18n::WordType::DICTIONARY_WORD, true).endPos;
438 8 : CPPUNIT_ASSERT(aExpected[i++] == nPos);
439 : }
440 8 : while (nPos++ < aTest.getLength());
441 2 : CPPUNIT_ASSERT(i == SAL_N_ELEMENTS(aExpected));
442 2 : }
443 :
444 : //https://issues.apache.org/ooo/show_bug.cgi?id=21290
445 3 : for (int j = 0; j < 2; ++j)
446 : {
447 2 : switch (j)
448 : {
449 : case 0:
450 1 : aLocale.Language = "en";
451 1 : aLocale.Country = "US";
452 1 : break;
453 : case 1:
454 1 : aLocale.Language = "grc";
455 1 : aLocale.Country = "";
456 1 : break;
457 : default:
458 0 : CPPUNIT_ASSERT(false);
459 0 : break;
460 : }
461 :
462 : const sal_Unicode TEST[] =
463 : {
464 : 0x1F0C, 0x03BD, 0x03B4, 0x03C1, 0x03B1, 0x0020, 0x1F00,
465 : 0x03C1, 0x03BD, 0x1F7B, 0x03BC, 0x03B5, 0x03BD, 0x03BF,
466 : 0x03C2, 0x0020, 0x1F00, 0x03BB, 0x03BB, 0x0020, 0x1F24,
467 : 0x03C3, 0x03B8, 0x03B9, 0x03BF, 0x03BD
468 2 : };
469 2 : OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
470 :
471 2 : sal_Int32 nPos = 0;
472 2 : sal_Int32 aExpected[] = {5, 15, 19, 26};
473 2 : size_t i = 0;
474 8 : do
475 : {
476 8 : CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
477 8 : nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
478 8 : i18n::WordType::DICTIONARY_WORD, true).endPos;
479 8 : CPPUNIT_ASSERT(aExpected[i++] == nPos);
480 : }
481 8 : while (nPos++ < aTest.getLength());
482 2 : CPPUNIT_ASSERT(i == SAL_N_ELEMENTS(aExpected));
483 2 : }
484 :
485 : //See https://issues.apache.org/ooo/show_bug.cgi?id=58513
486 : {
487 1 : aLocale.Language = "fi";
488 1 : aLocale.Country = "FI";
489 :
490 1 : OUString aTest("Kuorma-auto kaakkois- ja Keski-Suomi");
491 :
492 : {
493 1 : sal_Int32 nPos = 0;
494 1 : sal_Int32 aExpected[] = {12, 22, 25, 36};
495 1 : size_t i = 0;
496 4 : do
497 : {
498 4 : CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
499 4 : nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
500 4 : i18n::WordType::WORD_COUNT, true).endPos;
501 4 : CPPUNIT_ASSERT(aExpected[i++] == nPos);
502 : }
503 4 : while (nPos++ < aTest.getLength());
504 1 : CPPUNIT_ASSERT(i == SAL_N_ELEMENTS(aExpected));
505 : }
506 :
507 : {
508 1 : sal_Int32 nPos = 0;
509 1 : sal_Int32 aExpected[] = {0, 11, 12, 21, 22, 24, 25, 36};
510 1 : size_t i = 0;
511 4 : do
512 : {
513 4 : CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
514 4 : aBounds = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
515 4 : i18n::WordType::DICTIONARY_WORD, true);
516 4 : CPPUNIT_ASSERT(aExpected[i++] == aBounds.startPos);
517 4 : CPPUNIT_ASSERT(aExpected[i++] == aBounds.endPos);
518 4 : nPos = aBounds.endPos;
519 : }
520 4 : while (nPos++ < aTest.getLength());
521 1 : CPPUNIT_ASSERT(i == SAL_N_ELEMENTS(aExpected));
522 1 : }
523 : }
524 :
525 : //See https://issues.apache.org/ooo/show_bug.cgi?id=107843
526 : {
527 1 : aLocale.Language = "en";
528 1 : aLocale.Country = "US";
529 :
530 : const sal_Unicode TEST[] =
531 : {
532 : 'r', 'u', 0xFB00, 'l', 'e', ' ', 0xFB01, 's', 'h'
533 1 : };
534 1 : OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
535 :
536 1 : aBounds = m_xBreak->getWordBoundary(aTest, 1, aLocale, i18n::WordType::DICTIONARY_WORD, false);
537 1 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 5);
538 :
539 1 : aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, i18n::WordType::DICTIONARY_WORD, false);
540 1 : CPPUNIT_ASSERT(aBounds.startPos == 6 && aBounds.endPos == 9);
541 : }
542 :
543 : //See https://issues.apache.org/ooo/show_bug.cgi?id=113785
544 : {
545 1 : aLocale.Language = "en";
546 1 : aLocale.Country = "US";
547 :
548 : const sal_Unicode TEST[] =
549 : {
550 : 'a', 0x2013, 'b', 0x2014, 'c'
551 1 : };
552 1 : OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
553 :
554 1 : aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, i18n::WordType::DICTIONARY_WORD, true);
555 1 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 1);
556 :
557 1 : aBounds = m_xBreak->nextWord(aTest, 0, aLocale, i18n::WordType::DICTIONARY_WORD);
558 1 : CPPUNIT_ASSERT(aBounds.startPos == 2 && aBounds.endPos == 3);
559 :
560 1 : aBounds = m_xBreak->nextWord(aTest, aBounds.endPos, aLocale, i18n::WordType::DICTIONARY_WORD);
561 1 : CPPUNIT_ASSERT(aBounds.startPos == 4 && aBounds.endPos == 5);
562 1 : }
563 1 : }
564 :
565 : //See https://bugs.libreoffice.org/show_bug.cgi?id=40292
566 : //See https://issues.apache.org/ooo/show_bug.cgi?id=80412
567 : //See https://issues.apache.org/ooo/show_bug.cgi?id=111152
568 : //See https://issues.apache.org/ooo/show_bug.cgi?id=50172
569 1 : void TestBreakIterator::testGraphemeIteration()
570 : {
571 1 : lang::Locale aLocale;
572 1 : aLocale.Language = "bn";
573 1 : aLocale.Country = "IN";
574 :
575 : {
576 1 : const sal_Unicode BA_HALANT_LA[] = { 0x09AC, 0x09CD, 0x09AF };
577 1 : OUString aTest(BA_HALANT_LA, SAL_N_ELEMENTS(BA_HALANT_LA));
578 :
579 1 : sal_Int32 nDone=0;
580 : sal_Int32 nPos;
581 1 : nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
582 1 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
583 1 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == SAL_N_ELEMENTS(BA_HALANT_LA));
584 1 : nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(BA_HALANT_LA), aLocale,
585 1 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
586 1 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
587 : }
588 :
589 : {
590 1 : const sal_Unicode HA_HALANT_NA_VOWELSIGNI[] = { 0x09B9, 0x09CD, 0x09A3, 0x09BF };
591 1 : OUString aTest(HA_HALANT_NA_VOWELSIGNI, SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI));
592 :
593 1 : sal_Int32 nDone=0;
594 : sal_Int32 nPos;
595 1 : nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
596 1 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
597 1 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI));
598 1 : nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI), aLocale,
599 1 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
600 1 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
601 : }
602 :
603 : {
604 1 : const sal_Unicode TA_HALANT_MA_HALANT_YA [] = { 0x09A4, 0x09CD, 0x09AE, 0x09CD, 0x09AF };
605 1 : OUString aTest(TA_HALANT_MA_HALANT_YA, SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA));
606 :
607 1 : sal_Int32 nDone=0;
608 : sal_Int32 nPos;
609 1 : nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
610 1 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
611 1 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA));
612 1 : nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA), aLocale,
613 1 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
614 1 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
615 : }
616 :
617 1 : aLocale.Language = "ta";
618 1 : aLocale.Country = "IN";
619 :
620 : {
621 1 : const sal_Unicode KA_VIRAMA_SSA[] = { 0x0B95, 0x0BCD, 0x0BB7 };
622 1 : OUString aTest(KA_VIRAMA_SSA, SAL_N_ELEMENTS(KA_VIRAMA_SSA));
623 :
624 1 : sal_Int32 nDone=0;
625 1 : sal_Int32 nPos = 0;
626 :
627 1 : nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
628 1 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
629 1 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == SAL_N_ELEMENTS(KA_VIRAMA_SSA));
630 1 : nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(KA_VIRAMA_SSA), aLocale,
631 1 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
632 1 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
633 : }
634 :
635 : {
636 1 : const sal_Unicode KA_VOWELSIGNU[] = { 0x0B95, 0x0BC1 };
637 1 : OUString aTest(KA_VOWELSIGNU, SAL_N_ELEMENTS(KA_VOWELSIGNU));
638 :
639 1 : sal_Int32 nDone=0;
640 1 : sal_Int32 nPos = 0;
641 :
642 1 : nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
643 1 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
644 1 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == SAL_N_ELEMENTS(KA_VOWELSIGNU));
645 1 : nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(KA_VOWELSIGNU), aLocale,
646 1 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
647 1 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
648 : }
649 :
650 : {
651 : const sal_Unicode CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI[] =
652 1 : { 0x0B9A, 0x0BBF, 0x0BA4, 0x0BCD, 0x0BA4, 0x0BBF, 0x0BB0, 0x0BC8 };
653 : OUString aTest(CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI,
654 1 : SAL_N_ELEMENTS(CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI));
655 :
656 1 : sal_Int32 nDone=0;
657 1 : sal_Int32 nPos=0;
658 :
659 5 : for (sal_Int32 i = 0; i < 4; ++i)
660 : {
661 4 : sal_Int32 nOldPos = nPos;
662 4 : nPos = m_xBreak->nextCharacters(aTest, nPos, aLocale,
663 4 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
664 4 : CPPUNIT_ASSERT_MESSAGE("Should skip 2 units", nPos == nOldPos+2);
665 : }
666 :
667 5 : for (sal_Int32 i = 0; i < 4; ++i)
668 : {
669 4 : sal_Int32 nOldPos = nPos;
670 4 : nPos = m_xBreak->previousCharacters(aTest, nPos, aLocale,
671 4 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
672 4 : CPPUNIT_ASSERT_MESSAGE("Should skip 2 units", nPos == nOldPos-2);
673 1 : }
674 : }
675 :
676 : {
677 1 : const sal_Unicode ALEF_QAMATS [] = { 0x05D0, 0x05B8 };
678 1 : OUString aText(ALEF_QAMATS, SAL_N_ELEMENTS(ALEF_QAMATS));
679 :
680 1 : sal_Int32 nGraphemeCount = 0;
681 :
682 1 : sal_Int32 nCurPos = 0;
683 3 : while (nCurPos < aText.getLength())
684 : {
685 1 : sal_Int32 nCount2 = 1;
686 1 : nCurPos = m_xBreak->nextCharacters(aText, nCurPos, lang::Locale(),
687 1 : i18n::CharacterIteratorMode::SKIPCELL, nCount2, nCount2);
688 1 : ++nGraphemeCount;
689 : }
690 :
691 1 : CPPUNIT_ASSERT_MESSAGE("Should be considered 1 grapheme", nGraphemeCount == 1);
692 : }
693 :
694 1 : aLocale.Language = "hi";
695 1 : aLocale.Country = "IN";
696 :
697 : {
698 1 : const sal_Unicode SHA_VOWELSIGNII[] = { 0x936, 0x940 };
699 1 : OUString aTest(SHA_VOWELSIGNII, SAL_N_ELEMENTS(SHA_VOWELSIGNII));
700 :
701 1 : sal_Int32 nDone=0;
702 1 : sal_Int32 nPos = 0;
703 :
704 1 : nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
705 1 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
706 1 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == SAL_N_ELEMENTS(SHA_VOWELSIGNII));
707 1 : nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(SHA_VOWELSIGNII), aLocale,
708 1 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
709 1 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
710 1 : }
711 1 : }
712 :
713 : //A test to ensure that certain ranges and codepoints that are categorized as
714 : //weak remain as weak, so that existing docs that depend on this don't silently
715 : //change font for those weak chars
716 1 : void TestBreakIterator::testWeak()
717 : {
718 1 : lang::Locale aLocale;
719 1 : aLocale.Language = "en";
720 1 : aLocale.Country = "US";
721 :
722 : {
723 : const sal_Unicode WEAKS[] =
724 : {
725 : 0x0001, 0x0002,
726 : 0x0020, 0x00A0,
727 : 0x2150, 0x215F, //Number Forms, fractions
728 : 0x2160, 0x2180, //Number Forms, roman numerals
729 : 0x2200, 0x22FF, //Mathematical Operators
730 : 0x27C0, 0x27EF, //Miscellaneous Mathematical Symbols-A
731 : 0x2980, 0x29FF, //Miscellaneous Mathematical Symbols-B
732 : 0x2A00, 0x2AFF, //Supplemental Mathematical Operators
733 : 0x2100, 0x214F, //Letterlike Symbols
734 : 0x2308, 0x230B, //Miscellaneous technical
735 : 0x25A0, 0x25FF, //Geometric Shapes
736 : 0x2B30, 0x2B4C //Miscellaneous Symbols and Arrows
737 1 : };
738 1 : OUString aWeaks(WEAKS, SAL_N_ELEMENTS(WEAKS));
739 :
740 25 : for (sal_Int32 i = 0; i < aWeaks.getLength(); ++i)
741 : {
742 24 : sal_Int16 nScript = m_xBreak->getScriptType(aWeaks, i);
743 24 : OStringBuffer aMsg;
744 24 : aMsg.append("Char 0x");
745 24 : aMsg.append(static_cast<sal_Int32>(aWeaks[i]), 16);
746 24 : aMsg.append(" should have been weak");
747 48 : CPPUNIT_ASSERT_MESSAGE(aMsg.getStr(),
748 24 : nScript == i18n::ScriptType::WEAK);
749 25 : }
750 1 : }
751 1 : }
752 :
753 : //A test to ensure that certain ranges and codepoints that are categorized as
754 : //asian remain as asian, so that existing docs that depend on this don't silently
755 : //change font for those asian chars.
756 : //See https://bugs.libreoffice.org/show_bug.cgi?id=38095
757 1 : void TestBreakIterator::testAsian()
758 : {
759 1 : lang::Locale aLocale;
760 1 : aLocale.Language = "en";
761 1 : aLocale.Country = "US";
762 :
763 : {
764 : const sal_Unicode ASIANS[] =
765 : {
766 : //some typical CJK chars
767 : 0x4E00, 0x62FF,
768 : //The full HalfWidth and FullWidth block has historically been
769 : //designated as taking the CJK font :-(
770 : //HalfWidth and FullWidth forms of ASCII 0-9, categorized under
771 : //UAX24 as "Common" i.e. by that logic WEAK
772 : 0xFF10, 0xFF19,
773 : //HalfWidth and FullWidth forms of ASCII A-z, categorized under
774 : //UAX25 as "Latin", i.e. by that logic LATIN
775 : 0xFF21, 0xFF5A
776 1 : };
777 1 : OUString aAsians(ASIANS, SAL_N_ELEMENTS(ASIANS));
778 :
779 7 : for (sal_Int32 i = 0; i < aAsians.getLength(); ++i)
780 : {
781 6 : sal_Int16 nScript = m_xBreak->getScriptType(aAsians, i);
782 6 : OStringBuffer aMsg;
783 6 : aMsg.append("Char 0x");
784 6 : aMsg.append(static_cast<sal_Int32>(aAsians[i]), 16);
785 6 : aMsg.append(" should have been asian");
786 12 : CPPUNIT_ASSERT_MESSAGE(aMsg.getStr(),
787 6 : nScript == i18n::ScriptType::ASIAN);
788 7 : }
789 1 : }
790 1 : }
791 :
792 : //A test to ensure that our Lao word boundary detection is useful
793 0 : void TestBreakIterator::testLao()
794 : {
795 0 : lang::Locale aLocale;
796 0 : aLocale.Language = "lo";
797 0 : aLocale.Country = "LA";
798 :
799 0 : const sal_Unicode LAO[] = { 0x0e8d, 0x0eb4, 0x0e99, 0x0e94, 0x0eb5, 0x0e95, 0x0ec9, 0x0ead, 0x0e99, 0x0eae, 0x0eb1, 0x0e9a };
800 0 : OUString aTest(LAO, SAL_N_ELEMENTS(LAO));
801 0 : i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
802 0 : i18n::WordType::DICTIONARY_WORD, true);
803 :
804 0 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 5);
805 :
806 0 : aBounds = m_xBreak->getWordBoundary(aTest, aBounds.endPos, aLocale,
807 0 : i18n::WordType::DICTIONARY_WORD, true);
808 :
809 0 : CPPUNIT_ASSERT(aBounds.startPos == 5 && aBounds.endPos == 9);
810 :
811 0 : }
812 :
813 : //Checks Thai (th-TH) word boundary detection: a short sample must be reported as one word, and nextWord/previousWord must visit the same start positions
814 1 : void TestBreakIterator::testThai()
815 : {
816 1 : lang::Locale aLocale;
817 1 : aLocale.Language = "th";
818 1 : aLocale.Country = "TH";
819 :
820 : //See http://lists.freedesktop.org/archives/libreoffice/2012-February/025959.html
821 : { // case 1: the dictionary word at index 0 must span the whole 6-code-unit string
822 1 : const sal_Unicode THAI[] = { 0x0E01, 0x0E38, 0x0E2B, 0x0E25, 0x0E32, 0x0E1A };
823 1 : OUString aTest(THAI, SAL_N_ELEMENTS(THAI));
824 1 : i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
825 1 : i18n::WordType::DICTIONARY_WORD, true);
826 2 : CPPUNIT_ASSERT_MESSAGE("Should skip full word",
827 2 : aBounds.startPos == 0 && aBounds.endPos == aTest.getLength());
828 : }
829 :
830 : //See https://issues.apache.org/ooo/show_bug.cgi?id=29548
831 : //make sure forwards and back are consistent
832 : { // case 2: the sample is the same 21-code-unit sequence written twice
833 : const sal_Unicode THAI[] =
834 : {
835 : 0x0E2D, 0x0E38, 0x0E17, 0x0E22, 0x0E32, 0x0E19, 0x0E41,
836 : 0x0E2B, 0x0E48, 0x0E07, 0x0E0A, 0x0E32, 0x0E15, 0x0E34,
837 : 0x0E19, 0x0E49, 0x0E33, 0x0E2B, 0x0E19, 0x0E32, 0x0E27,
838 : 0x0E2D, 0x0E38, 0x0E17, 0x0E22, 0x0E32, 0x0E19, 0x0E41,
839 : 0x0E2B, 0x0E48, 0x0E07, 0x0E0A, 0x0E32, 0x0E15, 0x0E34,
840 : 0x0E19, 0x0E49, 0x0E33, 0x0E2B, 0x0E19, 0x0E32, 0x0E27
841 1 : };
842 1 : OUString aTest(THAI, SAL_N_ELEMENTS(THAI));
843 : //Forward pass: record every startPos returned by nextWord, beginning before the first character
844 2 : std::stack<sal_Int32> aPositions;
845 1 : sal_Int32 nPos = -1;
846 11 : do
847 : {
848 11 : nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
849 11 : aPositions.push(nPos);
850 : }
851 11 : while (nPos < aTest.getLength()); // stop once the returned position is no longer inside the string
852 1 : nPos = aTest.getLength(); // backward pass starts from the end of the string
853 1 : CPPUNIT_ASSERT(!aPositions.empty());
854 1 : aPositions.pop(); // drop the last recorded position, the one that ended the forward loop
855 10 : do
856 : {
857 10 : CPPUNIT_ASSERT(!aPositions.empty()); // backward pass must not yield more words than the forward pass
858 10 : nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
859 10 : CPPUNIT_ASSERT(nPos == aPositions.top()); // each backward start must match the forward one, in reverse order
860 10 : aPositions.pop();
861 : }
862 11 : while (nPos > 0); // finished once the start of the string has been reached
863 1 : }
864 1 : }
865 :
866 : #ifdef TODO
867 : void TestBreakIterator::testNorthernThai() // only compiled when TODO is defined; expects a 7-code-unit sample in locale nod-TH to be one dictionary word
868 : {
869 : lang::Locale aLocale;
870 : aLocale.Language = "nod";
871 : aLocale.Country = "TH";
872 : //Sample text as UTF-16 code units in the U+0E00..U+0E7F (Thai) range
873 : const sal_Unicode NORTHERN_THAI1[] = { 0x0E01, 0x0E38, 0x0E4A, 0x0E2B, 0x0E25, 0x0E32, 0x0E1A };
874 : OUString aTest(NORTHERN_THAI1, SAL_N_ELEMENTS(NORTHERN_THAI1));
875 : i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, // word containing index 0
876 : i18n::WordType::DICTIONARY_WORD, true);
877 : CPPUNIT_ASSERT_MESSAGE("Should skip full word",
878 : aBounds.startPos == 0 && aBounds.endPos == aTest.getLength()); // the word must span the entire string
879 : }
880 : #endif
881 :
882 : #if (U_ICU_VERSION_MAJOR_NUM > 4)
883 : // It is unclear whether any ICU version earlier than 49 shipped Khmer word
884 : // boundary dictionaries; 4.6 does not.
885 :
886 : //Checks Khmer (km-KH) dictionary word boundary detection: a 5-code-unit sample must split into words at startPos/endPos 0/3 and 3/5
887 : //https://bugs.libreoffice.org/show_bug.cgi?id=52020
888 1 : void TestBreakIterator::testKhmer()
889 : {
890 1 : lang::Locale aLocale;
891 1 : aLocale.Language = "km";
892 1 : aLocale.Country = "KH";
893 : //Sample text as UTF-16 code units in the U+1780..U+17FF (Khmer) range
894 1 : const sal_Unicode KHMER[] = { 0x17B2, 0x17D2, 0x1799, 0x1782, 0x17C1 };
895 :
896 2 : OUString aTest(KHMER, SAL_N_ELEMENTS(KHMER));
897 1 : i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, // word containing index 0
898 1 : i18n::WordType::DICTIONARY_WORD, true);
899 : //First word is expected to cover code units 0..3
900 1 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 3);
901 : //Continue the lookup from where the first word ended
902 1 : aBounds = m_xBreak->getWordBoundary(aTest, aBounds.endPos, aLocale,
903 1 : i18n::WordType::DICTIONARY_WORD, true);
904 : //Second word is expected to cover the remaining code units 3..5
905 2 : CPPUNIT_ASSERT(aBounds.startPos == 3 && aBounds.endPos == 5);
906 1 : }
907 : #endif
908 :
909 1 : void TestBreakIterator::testJapanese() // checks dictionary word boundaries for locale ja-JP on two samples
910 : {
911 1 : lang::Locale aLocale;
912 1 : aLocale.Language = "ja";
913 1 : aLocale.Country = "JP";
914 1 : i18n::Boundary aBounds;
915 :
916 : { // case 1: 7 code units in the U+30A0..U+30FF (Katakana) range must be reported as a single word
917 1 : const sal_Unicode JAPANESE[] = { 0x30B7, 0x30E3, 0x30C3, 0x30C8, 0x30C0, 0x30A6, 0x30F3 };
918 :
919 1 : OUString aTest(JAPANESE, SAL_N_ELEMENTS(JAPANESE));
920 1 : aBounds = m_xBreak->getWordBoundary(aTest, 5, aLocale, // query from inside the string (index 5)
921 1 : i18n::WordType::DICTIONARY_WORD, true);
922 : //The word found must span the whole string, 0..7
923 1 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 7);
924 : }
925 :
926 : { // case 2: the same 3-code-unit sequence written twice must yield two words of 3 code units each
927 1 : const sal_Unicode JAPANESE[] = { 0x9EBB, 0x306E, 0x8449, 0x9EBB, 0x306E, 0x8449 };
928 :
929 1 : OUString aTest(JAPANESE, SAL_N_ELEMENTS(JAPANESE));
930 1 : aBounds = m_xBreak->getWordBoundary(aTest, 1, aLocale, // index 1 lies in the first occurrence
931 1 : i18n::WordType::DICTIONARY_WORD, true);
932 : //First occurrence is expected at 0..3
933 1 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 3);
934 :
935 1 : aBounds = m_xBreak->getWordBoundary(aTest, 5, aLocale, // index 5 lies in the second occurrence
936 1 : i18n::WordType::DICTIONARY_WORD, true);
937 : //Second occurrence is expected at 3..6
938 1 : CPPUNIT_ASSERT(aBounds.startPos == 3 && aBounds.endPos == 6);
939 1 : }
940 1 : }
941 :
942 1 : void TestBreakIterator::testChinese() // checks a dictionary word boundary for locale zh-CN when the text ends in a surrogate pair
943 : {
944 1 : lang::Locale aLocale;
945 1 : aLocale.Language = "zh";
946 1 : aLocale.Country = "CN";
947 1 : i18n::Boundary aBounds;
948 :
949 : { // four single-unit characters followed by the UTF-16 surrogate pair 0xD867 0xDEDB at indices 4 and 5
950 1 : const sal_Unicode CHINESE[] = { 0x6A35, 0x6A30, 0x69FE, 0x8919, 0xD867, 0xDEDB };
951 :
952 1 : OUString aTest(CHINESE, SAL_N_ELEMENTS(CHINESE));
953 1 : aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, // query at the first half of the surrogate pair
954 1 : i18n::WordType::DICTIONARY_WORD, true);
955 1 : CPPUNIT_ASSERT(aBounds.startPos == 4 && aBounds.endPos == 6); // both halves of the pair must fall inside the same word
956 1 : }
957 1 : }
958 9 : void TestBreakIterator::setUp() // per-test fixture setup: runs the base class setup, then obtains the break iterator under test
959 : {
960 9 : BootstrapFixtureBase::setUp(); // base setup runs first; m_xSFactory is used right after it
961 27 : m_xBreak = uno::Reference< i18n::XBreakIterator >(m_xSFactory->createInstance( // instantiate the service by name and query it for XBreakIterator
962 18 : "com.sun.star.i18n.BreakIterator"), uno::UNO_QUERY_THROW); // NOTE(review): UNO_QUERY_THROW presumably throws if the interface is unavailable - confirm
963 9 : }
964 :
965 9 : void TestBreakIterator::tearDown() // per-test fixture teardown, mirroring setUp in reverse order
966 : {
967 9 : m_xBreak.clear(); // drop the break iterator reference before the base fixture is torn down
968 9 : BootstrapFixtureBase::tearDown();
969 9 : }
970 :
971 1 : CPPUNIT_TEST_SUITE_REGISTRATION(TestBreakIterator); // CppUnit macro: registers the TestBreakIterator suite
972 :
973 4 : CPPUNIT_PLUGIN_IMPLEMENT(); // CppUnit macro: NOTE(review): presumably emits the test plug-in entry point for this library - confirm against the CppUnit docs
974 :
975 : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|