Line data Source code
1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /*
3 : * Version: MPL 1.1 / GPLv3+ / LGPLv3+
4 : *
5 : * The contents of this file are subject to the Mozilla Public License Version
6 : * 1.1 (the "License"); you may not use this file except in compliance with
7 : * the License. You may obtain a copy of the License at
8 : * http://www.mozilla.org/MPL/
9 : *
10 : * Software distributed under the License is distributed on an "AS IS" basis,
11 : * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 : * for the specific language governing rights and limitations under the
13 : * License.
14 : *
15 : * The Initial Developer of the Original Code is
16 : * Caolán McNamara <caolanm@redhat.com>
17 : *
18 : * Contributor(s):
19 : * Caolán McNamara <caolanm@redhat.com>
20 : *
21 : * Alternatively, the contents of this file may be used under the terms of
22 : * either the GNU General Public License Version 3 or later (the "GPLv3+"), or
23 : * the GNU Lesser General Public License Version 3 or later (the "LGPLv3+"),
24 : * in which case the provisions of the GPLv3+ or the LGPLv3+ are applicable
25 : * instead of those above.
26 : */
27 :
28 : #include <cppuhelper/compbase1.hxx>
29 : #include <cppuhelper/bootstrap.hxx>
30 : #include <cppuhelper/basemutex.hxx>
31 : #include <com/sun/star/i18n/XBreakIterator.hpp>
32 : #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
33 : #include <com/sun/star/i18n/ScriptType.hpp>
34 : #include <com/sun/star/i18n/WordType.hpp>
35 : #include <unotest/bootstrapfixturebase.hxx>
36 :
37 : #include <unicode/uvernum.h>
38 :
39 : #include <rtl/strbuf.hxx>
40 : #include <rtl/ustrbuf.hxx>
41 :
42 : #include <string.h>
43 :
44 : #include <stack>
45 :
46 : using namespace ::com::sun::star;
47 :
//Test fixture for the i18n break iterator: every test* member drives the
//XBreakIterator reference held in m_xBreak. Only the members named in a
//CPPUNIT_TEST line between CPPUNIT_TEST_SUITE and CPPUNIT_TEST_SUITE_END are
//registered with the suite.
48 18 : class TestBreakIterator : public test::BootstrapFixtureBase
49 : {
50 : public:
//Fixture hooks; only declared here, their definitions are not in this class body
51 : virtual void setUp();
52 : virtual void tearDown();
53 :
//One member function per test case
54 : void testLineBreaking();
55 : void testWordBoundaries();
56 : void testGraphemeIteration();
57 : void testWeak();
58 : void testAsian();
59 : void testThai();
//NOTE(review): TODO is not defined anywhere in the visible source, so this
//presumably evaluates to 0 and testNorthernThai is compiled out - confirm that
//no build flag defines it
60 : #if TODO
61 : void testNorthernThai();
62 : #endif
63 : void testKhmer();
64 : void testJapanese();
65 :
66 2 : CPPUNIT_TEST_SUITE(TestBreakIterator);
67 1 : CPPUNIT_TEST(testLineBreaking);
68 1 : CPPUNIT_TEST(testGraphemeIteration);
69 1 : CPPUNIT_TEST(testWeak);
70 1 : CPPUNIT_TEST(testAsian);
71 1 : CPPUNIT_TEST(testThai);
72 : #if TODO
73 : CPPUNIT_TEST(testNorthernThai);
74 : #endif
//testWordBoundaries and testKhmer are declared unconditionally above but are
//only registered when the ICU major version (from unicode/uvernum.h) is above 4
75 : #if (U_ICU_VERSION_MAJOR_NUM > 4)
76 : CPPUNIT_TEST(testWordBoundaries);
77 : CPPUNIT_TEST(testKhmer);
78 : #endif
79 1 : CPPUNIT_TEST(testJapanese);
80 2 : CPPUNIT_TEST_SUITE_END();
81 : private:
//The break iterator under test; it is not assigned anywhere in this class body
82 : uno::Reference<i18n::XBreakIterator> m_xBreak;
83 : };
84 :
//Regression checks for m_xBreak->getLineBreak. Each case builds a short string,
//asks for a line break at a given index (second argument) and asserts the
//breakIndex of the returned LineBreakResults. The hyphenation and user options
//are default-constructed and never modified; the fourth argument is always 0.
85 1 : void TestBreakIterator::testLineBreaking()
86 : {
87 1 : i18n::LineBreakHyphenationOptions aHyphOptions;
88 1 : i18n::LineBreakUserOptions aUserOptions;
//aLocale is reused by every case below and re-assigned before each one
89 1 : lang::Locale aLocale;
90 :
91 : //See https://bugs.freedesktop.org/show_bug.cgi?id=31271
92 : {
93 1 : rtl::OUString aTest(RTL_CONSTASCII_USTRINGPARAM("(some text here)"));
94 :
95 1 : aLocale.Language = rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("en"));
96 1 : aLocale.Country = rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("US"));
97 :
98 : {
99 : //Here we want the line break to leave text here) on the next line
//The request index is 9 (inside "text"); index 6 is the 't' that starts "text"
100 1 : i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some tex"), aLocale, 0, aHyphOptions, aUserOptions);
101 1 : CPPUNIT_ASSERT_MESSAGE("Expected a break at the the start of the word", aResult.breakIndex == 6);
102 : }
103 :
104 : {
105 : //Here we want the line break to leave "here)" on the next line
//The request index is 15 (the closing parenthesis); index 11 is the 'h' of "here"
106 1 : i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("(some text here"), aLocale, 0, aHyphOptions, aUserOptions);
107 1 : CPPUNIT_ASSERT_MESSAGE("Expected a break at the the start of the word", aResult.breakIndex == 11);
108 1 : }
109 : }
110 :
111 : //See https://bugs.freedesktop.org/show_bug.cgi?id=49849
112 : {
//aTest is the same five-character Hebrew word twice, separated by one space
113 1 : const sal_Unicode HEBREW1[] = { 0x05DE, 0x05D9, 0x05DC, 0x05D9, 0x5DD };
114 1 : rtl::OUString aWord(HEBREW1, SAL_N_ELEMENTS(HEBREW1));
115 1 : rtl::OUString aTest(rtl::OUStringBuffer(aWord).append(' ').append(aWord).makeStringAndClear());
116 :
117 1 : aLocale.Language = rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("he"));
118 1 : aLocale.Country = rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("IL"));
119 :
120 : {
121 : //Here we want the line break to happen at the whitespace
//The request index is the last character; aWord.getLength()+1 is the index of
//the first character of the second word, i.e. just past the space
122 1 : i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-1, aLocale, 0, aHyphOptions, aUserOptions);
123 1 : CPPUNIT_ASSERT_MESSAGE("Expected a break at the the start of the word", aResult.breakIndex == aWord.getLength()+1);
124 1 : }
125 : }
126 :
127 : //See https://issues.apache.org/ooo/show_bug.cgi?id=17155
128 : {
129 1 : rtl::OUString aTest(RTL_CONSTASCII_USTRINGPARAM("foo /bar/baz"));
130 :
131 1 : aLocale.Language = rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("en"));
132 1 : aLocale.Country = rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("US"));
133 :
134 : {
135 : //Here we want the line break to leave /bar/ba clumped together on the next line
//Index 4 is the first '/', so no break is expected at the second slash
136 1 : i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, strlen("foo /bar/ba"), aLocale, 0,
137 1 : aHyphOptions, aUserOptions);
138 1 : CPPUNIT_ASSERT_MESSAGE("Expected a break at the first slash", aResult.breakIndex == 4);
139 1 : }
140 : }
141 :
142 : //See https://issues.apache.org/ooo/show_bug.cgi?id=19716
143 : {
144 1 : rtl::OUString aTest(RTL_CONSTASCII_USTRINGPARAM("aaa]aaa"));
145 :
146 1 : aLocale.Language = rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("en"));
147 1 : aLocale.Country = rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("US"));
148 :
149 : {
150 : //Here we want the line break to move the whole lot to the next line
//The request index is length-2 (after the ']'); a result of 0 means the ']'
//was not used as a break opportunity
151 2 : i18n::LineBreakResults aResult = m_xBreak->getLineBreak(aTest, aTest.getLength()-2, aLocale, 0,
152 2 : aHyphOptions, aUserOptions);
153 1 : CPPUNIT_ASSERT_MESSAGE("Expected a break at the start of the line, not at ]", aResult.breakIndex == 0);
154 1 : }
155 1 : }
156 1 : }
157 :
158 : //See https://bugs.freedesktop.org/show_bug.cgi?id=49629
//Regression checks for the word-boundary API of m_xBreak: isBeginWord,
//isEndWord, getWordBoundary, nextWord and previousWord, across several
//WordType modes and locales. aLocale and aBounds are shared by all sections
//and overwritten as the test proceeds, so the sections are order-dependent.
//This test is only registered in the suite when U_ICU_VERSION_MAJOR_NUM > 4.
159 0 : void TestBreakIterator::testWordBoundaries()
160 : {
161 0 : lang::Locale aLocale;
162 0 : aLocale.Language = rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("en"));
163 0 : aLocale.Country = rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("US"));
164 :
165 0 : i18n::Boundary aBounds;
166 :
167 : //See https://issues.apache.org/ooo/show_bug.cgi?id=11993
168 : {
//NOTE(review): the expected offsets below (9..12, 16..19) do not line up with
//the literal as it appears here - presumably a run of spaces was collapsed when
//this listing was produced; verify against the real source before editing
169 0 : rtl::OUString aTest("abcd ef ghi??? KLM");
170 :
171 0 : CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD));
172 0 : CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD));
173 0 : aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD, true);
174 0 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
175 :
//Position 8 is expected to be neither a word start nor a word end; the last
//argument of getWordBoundary then selects the following (true) or the
//preceding (false) word
176 0 : CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD));
177 0 : CPPUNIT_ASSERT(!m_xBreak->isEndWord(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD));
178 :
179 : //next word
180 0 : aBounds = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, true);
181 0 : CPPUNIT_ASSERT(aBounds.startPos == 9 && aBounds.endPos == 12);
182 :
183 : //previous word
184 0 : aBounds = m_xBreak->getWordBoundary(aTest, 8, aLocale, i18n::WordType::DICTIONARY_WORD, false);
185 0 : CPPUNIT_ASSERT(aBounds.startPos == 5 && aBounds.endPos == 7);
186 :
187 0 : CPPUNIT_ASSERT(!m_xBreak->isBeginWord(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD));
188 0 : CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD));
189 0 : aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD, true);
190 0 : CPPUNIT_ASSERT(aBounds.startPos == 9 && aBounds.endPos == 12);
191 :
192 0 : CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD));
193 0 : CPPUNIT_ASSERT(!m_xBreak->isEndWord(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD));
194 0 : aBounds = m_xBreak->getWordBoundary(aTest, 16, aLocale, i18n::WordType::DICTIONARY_WORD, true);
195 0 : CPPUNIT_ASSERT(aBounds.startPos == 16 && aBounds.endPos == 19);
196 : }
197 :
198 : //See https://issues.apache.org/ooo/show_bug.cgi?id=21907
199 : {
200 0 : rtl::OUString aTest("b a?");
201 :
//With ANY_WORD, positions 1, 2 and 3 are each expected to be both a word
//start and a word end
202 0 : CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 1, aLocale, i18n::WordType::ANY_WORD));
203 0 : CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 2, aLocale, i18n::WordType::ANY_WORD));
204 0 : CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 3, aLocale, i18n::WordType::ANY_WORD));
205 :
206 0 : CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES));
207 :
208 0 : CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 1, aLocale, i18n::WordType::ANY_WORD));
209 0 : CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 2, aLocale, i18n::WordType::ANY_WORD));
210 0 : CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 3, aLocale, i18n::WordType::ANY_WORD));
211 :
212 0 : CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, 3, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES));
213 : }
214 :
215 : //See https://issues.apache.org/ooo/show_bug.cgi?id=14904
216 : {
//Sentence containing typographic quotes (0x201C/0x201D, 0x2018/0x2019) and an
//inverted question mark (0x00BF); each check below asks for the dictionary
//word around an index inside a word and expects the quotes to be excluded
217 : const sal_Unicode TEST[] =
218 : {
219 : 'W', 'o', 'r', 'k', 'i', 'n', 'g', ' ', 0x201C, 'W', 'o', 'r', 'd', 's',
220 : ' ', 's', 't', 'a', 'r', 't', 'i', 'n', 'g', ' ', 'w', 'i', 't',
221 : 'h', ' ', 'q', 'u', 'o', 't', 'e', 's', 0x201D, ' ', 'W', 'o', 'r', 'k',
222 : 'i', 'n', 'g', ' ', 0x2018, 'B', 'r', 'o', 'k', 'e', 'n', 0x2019, ' ',
223 : '?', 'S', 'p', 'a', 'n', 'i', 's', 'h', '?', ' ', 'd', 'o', 'e',
224 : 's', 'n', 0x2019, 't', ' ', 'w', 'o', 'r', 'k', '.', ' ', 'N', 'o',
225 : 't', ' ', 'e', 'v', 'e', 'n', ' ' , 0x00BF, 'r', 'e', 'a', 'l', '?', ' ',
226 : 'S', 'p', 'a', 'n', 'i', 's', 'h'
227 0 : };
228 0 : rtl::OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
229 :
230 0 : aBounds = m_xBreak->getWordBoundary(aTest, 4, aLocale, i18n::WordType::DICTIONARY_WORD, false);
231 0 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 7);
232 :
233 0 : aBounds = m_xBreak->getWordBoundary(aTest, 12, aLocale, i18n::WordType::DICTIONARY_WORD, false);
234 0 : CPPUNIT_ASSERT(aBounds.startPos == 9 && aBounds.endPos == 14);
235 :
236 0 : aBounds = m_xBreak->getWordBoundary(aTest, 40, aLocale, i18n::WordType::DICTIONARY_WORD, false);
237 0 : CPPUNIT_ASSERT(aBounds.startPos == 37 && aBounds.endPos == 44);
238 :
239 0 : aBounds = m_xBreak->getWordBoundary(aTest, 49, aLocale, i18n::WordType::DICTIONARY_WORD, false);
240 0 : CPPUNIT_ASSERT(aBounds.startPos == 46 && aBounds.endPos == 52);
241 :
242 0 : aBounds = m_xBreak->getWordBoundary(aTest, 58, aLocale, i18n::WordType::DICTIONARY_WORD, false);
243 0 : CPPUNIT_ASSERT(aBounds.startPos == 55 && aBounds.endPos == 62);
244 :
245 0 : aBounds = m_xBreak->getWordBoundary(aTest, 67, aLocale, i18n::WordType::DICTIONARY_WORD, false);
246 0 : CPPUNIT_ASSERT(aBounds.startPos == 64 && aBounds.endPos == 71);
247 :
248 0 : aBounds = m_xBreak->getWordBoundary(aTest, 90, aLocale, i18n::WordType::DICTIONARY_WORD, false);
249 0 : CPPUNIT_ASSERT(aBounds.startPos == 88 && aBounds.endPos == 92);
250 : }
251 :
252 : //See https://bugs.freedesktop.org/show_bug.cgi?id=49629
//Characters expected to separate words: each one is placed between two copies
//of "Word" and the first word is expected to end at index 4 in every mode
253 0 : sal_Unicode aBreakTests[] = { ' ', 1, 2, 3, 4, 5, 6, 7, 0x91, 0x92, 0x200B, 0xE8FF, 0xF8FF };
//The loop relies on the WordType constants from ANY_WORD to WORD_COUNT being
//consecutive integers
254 0 : for (int mode = i18n::WordType::ANY_WORD; mode <= i18n::WordType::WORD_COUNT; ++mode)
255 : {
256 : //make sure that in all cases isBeginWord and isEndWord matches getWordBoundary
257 0 : for (size_t i = 0; i < SAL_N_ELEMENTS(aBreakTests); ++i)
258 : {
259 0 : rtl::OUString aTest("Word");
260 0 : aTest += rtl::OUString(aBreakTests[i]) + rtl::OUString("Word");
261 0 : aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, mode, true);
//NOTE(review): all four cases assert the same bounds, so the switch currently
//adds nothing beyond skipping any mode value that is not listed
262 0 : switch (mode)
263 : {
264 : case i18n::WordType::ANY_WORD:
265 0 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
266 0 : break;
267 : case i18n::WordType::ANYWORD_IGNOREWHITESPACES:
268 0 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
269 0 : break;
270 : case i18n::WordType::DICTIONARY_WORD:
271 0 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
272 0 : break;
273 : case i18n::WordType::WORD_COUNT:
274 0 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 4);
275 0 : break;
276 : }
277 :
278 0 : CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, aBounds.startPos, aLocale, mode));
279 0 : CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, aBounds.endPos, aLocale, mode));
280 0 : }
281 : }
282 :
//Characters expected NOT to separate words: "Word" + char + "Word" is expected
//to be reported as one word covering all 9 code units in every mode
283 0 : sal_Unicode aJoinTests[] = { 'X', 0x200C, 0x200D, 0x2060, 0xFEFF, 0xFFF9, 0xFFFA, 0xFFFB };
284 0 : for (int mode = i18n::WordType::ANY_WORD; mode <= i18n::WordType::WORD_COUNT; ++mode)
285 : {
286 : //make sure that in all cases isBeginWord and isEndWord matches getWordBoundary
287 0 : for (size_t i = 0; i < SAL_N_ELEMENTS(aJoinTests); ++i)
288 : {
289 0 : rtl::OUString aTest("Word");
290 0 : aTest += rtl::OUString(aJoinTests[i]) + rtl::OUString("Word");
291 0 : aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, mode, true);
//NOTE(review): as above, every case asserts the same bounds
292 0 : switch (mode)
293 : {
294 : case i18n::WordType::ANY_WORD:
295 0 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
296 0 : break;
297 : case i18n::WordType::ANYWORD_IGNOREWHITESPACES:
298 0 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
299 0 : break;
300 : case i18n::WordType::DICTIONARY_WORD:
301 0 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
302 0 : break;
303 : case i18n::WordType::WORD_COUNT:
304 0 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 9);
305 0 : break;
306 : }
307 :
308 0 : CPPUNIT_ASSERT(m_xBreak->isBeginWord(aTest, aBounds.startPos, aLocale, mode));
309 0 : CPPUNIT_ASSERT(m_xBreak->isEndWord(aTest, aBounds.endPos, aLocale, mode));
310 0 : }
311 : }
312 :
313 : //See https://issues.apache.org/ooo/show_bug.cgi?id=13494
314 : {
//Every 'x' (or every "xx" pair) in aBase is replaced by one punctuation
//character from aTests; the string is then walked forwards with nextWord
//starting from -1 and backwards with previousWord starting from the end, and
//each reported startPos is compared against a table of expected positions
315 0 : const rtl::OUString aBase("xxAAxxBBxxCCxx");
316 : const sal_Unicode aTests[] =
317 : {
318 : '\'', ';', ',', '.', '!', '@', '#', '%', '&', '*',
319 : '(', ')', '_', '-', '{', '}', '[', ']', '\"', '/',
320 : '\\', '?', '~', '$', '+', '^', '=', '<', '>', '|'
321 0 : };
322 :
//Doubled punctuation: expected positions for every character in aTests
323 0 : const sal_Int32 aDoublePositions[] = {0, 2, 4, 6, 8, 10, 12, 14};
324 0 : for (size_t j = 0; j < SAL_N_ELEMENTS(aTests); ++j)
325 : {
326 0 : rtl::OUString aTest = aBase.replace('x', aTests[j]);
327 0 : sal_Int32 nPos = -1;
328 0 : size_t i = 0;
329 0 : do
330 : {
//Guard against reading past the expectation table if nextWord over-produces
331 0 : CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aDoublePositions));
332 0 : nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
333 0 : CPPUNIT_ASSERT(nPos == aDoublePositions[i++]);
334 : }
335 0 : while (nPos < aTest.getLength());
336 0 : nPos = aTest.getLength();
//Walk back; i starts at the last index and is pre-decremented, so the first
//expected value is the second-to-last table entry
337 0 : i = SAL_N_ELEMENTS(aDoublePositions)-1;
338 0 : do
339 : {
340 0 : nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
341 0 : CPPUNIT_ASSERT(nPos == aDoublePositions[--i]);
342 : }
343 : while (nPos > 0);
344 0 : }
345 :
//Single punctuation: j starts at 1 because aTests[0], the apostrophe, has its
//own expectation table further down
346 0 : const sal_Int32 aSinglePositions[] = {0, 1, 3, 4, 6, 7, 9, 10};
347 0 : for (size_t j = 1; j < SAL_N_ELEMENTS(aTests); ++j)
348 : {
349 0 : rtl::OUString aTest = aBase.replaceAll(rtl::OUString("xx"), rtl::OUString(aTests[j]));
350 0 : sal_Int32 nPos = -1;
351 0 : size_t i = 0;
352 0 : do
353 : {
354 0 : CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aSinglePositions));
355 0 : nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
356 0 : CPPUNIT_ASSERT(nPos == aSinglePositions[i++]);
357 : }
358 0 : while (nPos < aTest.getLength());
359 0 : nPos = aTest.getLength();
360 0 : i = SAL_N_ELEMENTS(aSinglePositions)-1;
361 0 : do
362 : {
363 0 : nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
364 0 : CPPUNIT_ASSERT(nPos == aSinglePositions[--i]);
365 : }
366 : while (nPos > 0);
367 0 : }
368 :
//Single apostrophe: the expected positions skip 3..7, i.e. the apostrophes
//between the letter pairs are expected not to start new words
369 0 : const sal_Int32 aSingleQuotePositions[] = {0, 1, 9, 10};
370 0 : CPPUNIT_ASSERT(aTests[0] == '\'');
371 : {
372 0 : rtl::OUString aTest = aBase.replaceAll(rtl::OUString("xx"), rtl::OUString(aTests[0]));
373 0 : sal_Int32 nPos = -1;
374 0 : size_t i = 0;
375 0 : do
376 : {
377 0 : CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aSingleQuotePositions));
378 0 : nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
379 0 : CPPUNIT_ASSERT(nPos == aSingleQuotePositions[i++]);
380 : }
381 0 : while (nPos < aTest.getLength());
382 0 : nPos = aTest.getLength();
383 0 : i = SAL_N_ELEMENTS(aSingleQuotePositions)-1;
384 0 : do
385 : {
386 0 : nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
387 0 : CPPUNIT_ASSERT(nPos == aSingleQuotePositions[--i]);
388 : }
389 0 : while (nPos > 0);
390 0 : }
391 : }
392 :
393 : //See https://issues.apache.org/ooo/show_bug.cgi?id=13451
394 : {
//From here on aLocale no longer holds en-US; each later section sets it again
395 0 : aLocale.Language = rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("ca"));
396 0 : aLocale.Country = rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("ES"));
397 :
398 0 : rtl::OUString aTest("mirar-se comprar-vos donem-nos les mans aneu-vos-en!");
399 :
//Collect the endPos of each successive word: the loop condition compares nPos
//with the length and then increments it, so the next lookup starts one past
//the previous end. The final assert checks every expected entry was consumed.
400 0 : sal_Int32 nPos = 0;
401 0 : sal_Int32 aExpected[] = {8, 20, 30, 34, 39, 51, 52};
402 0 : size_t i = 0;
403 0 : do
404 : {
405 0 : CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
406 0 : nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
407 0 : i18n::WordType::DICTIONARY_WORD, true).endPos;
408 0 : CPPUNIT_ASSERT(aExpected[i++] == nPos);
409 : }
410 0 : while (nPos++ < aTest.getLength());
411 0 : CPPUNIT_ASSERT(i == SAL_N_ELEMENTS(aExpected));
412 : }
413 :
414 : //See https://issues.apache.org/ooo/show_bug.cgi?id=85411
//Runs the same check under two locales (en-US, then ca-ES): words separated
//only by 0x200B are expected to end at 1, 6, 9 and 12
415 0 : for (int j = 0; j < 2; ++j)
416 : {
417 0 : switch (j)
418 : {
419 : case 0:
420 0 : aLocale.Language = rtl::OUString("en");
421 0 : aLocale.Country = rtl::OUString("US");
422 0 : break;
423 : case 1:
424 0 : aLocale.Language = rtl::OUString("ca");
425 0 : aLocale.Country = rtl::OUString("ES");
426 0 : break;
427 : default:
428 0 : CPPUNIT_ASSERT(false);
429 0 : break;
430 : }
431 :
432 : const sal_Unicode TEST[] =
433 : {
434 : 'I', 0x200B, 'w', 'a', 'n', 't', 0x200B, 't', 'o', 0x200B, 'g', 'o'
435 0 : };
436 0 : rtl::OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
437 :
438 0 : sal_Int32 nPos = 0;
439 0 : sal_Int32 aExpected[] = {1, 6, 9, 12};
440 0 : size_t i = 0;
441 0 : do
442 : {
443 0 : CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
444 0 : nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
445 0 : i18n::WordType::DICTIONARY_WORD, true).endPos;
446 0 : CPPUNIT_ASSERT(aExpected[i++] == nPos);
447 : }
448 0 : while (nPos++ < aTest.getLength());
449 0 : CPPUNIT_ASSERT(i == SAL_N_ELEMENTS(aExpected));
450 0 : }
451 :
452 : //https://issues.apache.org/ooo/show_bug.cgi?id=21290
//Same pattern under en-US and then language "grc" with an empty country: the
//Greek text, which includes code points from the 0x1Fxx range, is expected to
//split only at the three spaces (0x0020)
453 0 : for (int j = 0; j < 2; ++j)
454 : {
455 0 : switch (j)
456 : {
457 : case 0:
458 0 : aLocale.Language = rtl::OUString("en");
459 0 : aLocale.Country = rtl::OUString("US");
460 0 : break;
461 : case 1:
462 0 : aLocale.Language = rtl::OUString("grc");
463 0 : aLocale.Country = rtl::OUString();
464 0 : break;
465 : default:
466 0 : CPPUNIT_ASSERT(false);
467 0 : break;
468 : }
469 :
470 : const sal_Unicode TEST[] =
471 : {
472 : 0x1F0C, 0x03BD, 0x03B4, 0x03C1, 0x03B1, 0x0020, 0x1F00,
473 : 0x03C1, 0x03BD, 0x1F7B, 0x03BC, 0x03B5, 0x03BD, 0x03BF,
474 : 0x03C2, 0x0020, 0x1F00, 0x03BB, 0x03BB, 0x0020, 0x1F24,
475 : 0x03C3, 0x03B8, 0x03B9, 0x03BF, 0x03BD
476 0 : };
477 0 : rtl::OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
478 :
479 0 : sal_Int32 nPos = 0;
480 0 : sal_Int32 aExpected[] = {5, 15, 19, 26};
481 0 : size_t i = 0;
482 0 : do
483 : {
484 0 : CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
485 0 : nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
486 0 : i18n::WordType::DICTIONARY_WORD, true).endPos;
487 0 : CPPUNIT_ASSERT(aExpected[i++] == nPos);
488 : }
489 0 : while (nPos++ < aTest.getLength());
490 0 : CPPUNIT_ASSERT(i == SAL_N_ELEMENTS(aExpected));
491 0 : }
492 :
493 : //See https://issues.apache.org/ooo/show_bug.cgi?id=58513
494 : {
495 0 : aLocale.Language = "fi";
496 0 : aLocale.Country = "FI";
497 :
498 0 : rtl::OUString aTest("Kuorma-auto kaakkois- ja Keski-Suomi");
499 :
//The same Finnish text is checked in two modes with different expectations:
//first only the endPos values under WORD_COUNT ...
500 : {
501 0 : sal_Int32 nPos = 0;
502 0 : sal_Int32 aExpected[] = {12, 22, 25, 36};
503 0 : size_t i = 0;
504 0 : do
505 : {
506 0 : CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
507 0 : nPos = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
508 0 : i18n::WordType::WORD_COUNT, true).endPos;
509 0 : CPPUNIT_ASSERT(aExpected[i++] == nPos);
510 : }
511 0 : while (nPos++ < aTest.getLength());
512 0 : CPPUNIT_ASSERT(i == SAL_N_ELEMENTS(aExpected));
513 : }
514 :
//... then both startPos and endPos under DICTIONARY_WORD; aExpected holds
//start/end pairs, so each iteration consumes two entries
515 : {
516 0 : sal_Int32 nPos = 0;
517 0 : sal_Int32 aExpected[] = {0, 11, 12, 21, 22, 24, 25, 36};
518 0 : size_t i = 0;
519 0 : do
520 : {
521 0 : CPPUNIT_ASSERT(i < SAL_N_ELEMENTS(aExpected));
522 0 : aBounds = m_xBreak->getWordBoundary(aTest, nPos, aLocale,
523 0 : i18n::WordType::DICTIONARY_WORD, true);
524 0 : CPPUNIT_ASSERT(aExpected[i++] == aBounds.startPos);
525 0 : CPPUNIT_ASSERT(aExpected[i++] == aBounds.endPos);
526 0 : nPos = aBounds.endPos;
527 : }
528 0 : while (nPos++ < aTest.getLength());
529 0 : CPPUNIT_ASSERT(i == SAL_N_ELEMENTS(aExpected));
530 0 : }
531 : }
532 :
533 : //See https://issues.apache.org/ooo/show_bug.cgi?id=107843
534 : {
535 0 : aLocale.Language = rtl::OUString("en");
536 0 : aLocale.Country = rtl::OUString("US");
537 :
//0xFB00 and 0xFB01 are the Latin "ff" and "fi" ligatures; each is expected to
//stay inside its word
538 : const sal_Unicode TEST[] =
539 : {
540 : 'r', 'u', 0xFB00, 'l', 'e', ' ', 0xFB01, 's', 'h'
541 0 : };
542 0 : rtl::OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
543 :
544 0 : aBounds = m_xBreak->getWordBoundary(aTest, 1, aLocale, i18n::WordType::DICTIONARY_WORD, false);
545 0 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 5);
546 :
547 0 : aBounds = m_xBreak->getWordBoundary(aTest, 7, aLocale, i18n::WordType::DICTIONARY_WORD, false);
548 0 : CPPUNIT_ASSERT(aBounds.startPos == 6 && aBounds.endPos == 9);
549 : }
550 :
551 : //See https://issues.apache.org/ooo/show_bug.cgi?id=113785
552 : {
553 0 : aLocale.Language = rtl::OUString("en");
554 0 : aLocale.Country = rtl::OUString("US");
555 :
//0x2013 (en dash) and 0x2014 (em dash) are expected to separate the single
//letters into three one-character words
556 : const sal_Unicode TEST[] =
557 : {
558 : 'a', 0x2013, 'b', 0x2014, 'c'
559 0 : };
560 0 : rtl::OUString aTest(TEST, SAL_N_ELEMENTS(TEST));
561 :
562 0 : aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale, i18n::WordType::DICTIONARY_WORD, true);
563 0 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 1);
564 :
565 0 : aBounds = m_xBreak->nextWord(aTest, 0, aLocale, i18n::WordType::DICTIONARY_WORD);
566 0 : CPPUNIT_ASSERT(aBounds.startPos == 2 && aBounds.endPos == 3);
567 :
568 0 : aBounds = m_xBreak->nextWord(aTest, aBounds.endPos, aLocale, i18n::WordType::DICTIONARY_WORD);
569 0 : CPPUNIT_ASSERT(aBounds.startPos == 4 && aBounds.endPos == 5);
570 0 : }
571 0 : }
572 :
573 : //See https://bugs.freedesktop.org/show_bug.cgi?id=40292
574 : //See https://issues.apache.org/ooo/show_bug.cgi?id=80412
575 : //See https://issues.apache.org/ooo/show_bug.cgi?id=111152
576 : //See https://issues.apache.org/ooo/show_bug.cgi?id=50172
//Checks that nextCharacters/previousCharacters in SKIPCELL mode step over
//whole multi-code-unit clusters. Most cases follow one pattern: step forward
//by 1 from index 0 and expect to land at the end of the sequence, then step
//back by 1 from the end and expect to land at 0.
577 1 : void TestBreakIterator::testGraphemeIteration()
578 : {
579 1 : lang::Locale aLocale;
//First group of cases runs under locale bn-IN
580 1 : aLocale.Language = rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bn"));
581 1 : aLocale.Country = rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("IN"));
582 :
583 : {
584 1 : const sal_Unicode BA_HALANT_LA[] = { 0x09AC, 0x09CD, 0x09AF };
585 1 : rtl::OUString aTest(BA_HALANT_LA, SAL_N_ELEMENTS(BA_HALANT_LA));
586 :
//nDone is passed as the last argument of both calls; its value is never checked
587 1 : sal_Int32 nDone=0;
588 : sal_Int32 nPos;
589 1 : nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
590 1 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
591 1 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == SAL_N_ELEMENTS(BA_HALANT_LA));
592 1 : nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(BA_HALANT_LA), aLocale,
593 1 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
594 1 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
595 : }
596 :
597 : {
598 1 : const sal_Unicode HA_HALANT_NA_VOWELSIGNI[] = { 0x09B9, 0x09CD, 0x09A3, 0x09BF };
599 1 : rtl::OUString aTest(HA_HALANT_NA_VOWELSIGNI, SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI));
600 :
601 1 : sal_Int32 nDone=0;
602 : sal_Int32 nPos;
603 1 : nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
604 1 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
605 1 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI));
606 1 : nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(HA_HALANT_NA_VOWELSIGNI), aLocale,
607 1 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
608 1 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
609 : }
610 :
611 : {
612 1 : const sal_Unicode TA_HALANT_MA_HALANT_YA [] = { 0x09A4, 0x09CD, 0x09AE, 0x09CD, 0x09AF };
613 1 : rtl::OUString aTest(TA_HALANT_MA_HALANT_YA, SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA));
614 :
615 1 : sal_Int32 nDone=0;
616 : sal_Int32 nPos;
617 1 : nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
618 1 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
619 1 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA));
620 1 : nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(TA_HALANT_MA_HALANT_YA), aLocale,
621 1 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
622 1 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
623 : }
624 :
//Second group of cases runs under locale ta-IN
625 1 : aLocale.Language = rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("ta"));
626 1 : aLocale.Country = rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("IN"));
627 :
628 : {
629 1 : const sal_Unicode KA_VIRAMA_SSA[] = { 0x0B95, 0x0BCD, 0x0BB7 };
630 1 : rtl::OUString aTest(KA_VIRAMA_SSA, SAL_N_ELEMENTS(KA_VIRAMA_SSA));
631 :
632 1 : sal_Int32 nDone=0;
633 1 : sal_Int32 nPos = 0;
634 :
635 1 : nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
636 1 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
637 1 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == SAL_N_ELEMENTS(KA_VIRAMA_SSA));
638 1 : nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(KA_VIRAMA_SSA), aLocale,
639 1 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
640 1 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
641 : }
642 :
643 : {
644 1 : const sal_Unicode KA_VOWELSIGNU[] = { 0x0B95, 0x0BC1 };
645 1 : rtl::OUString aTest(KA_VOWELSIGNU, SAL_N_ELEMENTS(KA_VOWELSIGNU));
646 :
647 1 : sal_Int32 nDone=0;
648 1 : sal_Int32 nPos = 0;
649 :
650 1 : nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
651 1 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
652 1 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == SAL_N_ELEMENTS(KA_VOWELSIGNU));
653 1 : nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(KA_VOWELSIGNU), aLocale,
654 1 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
655 1 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
656 : }
657 :
658 : {
//Eight code units expected to form four cells of two units each: four single
//steps forward must each advance by 2, then four steps back must each retreat
//by 2, carrying nPos over from the forward walk
659 : const sal_Unicode CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI[] =
660 1 : { 0x0B9A, 0x0BBF, 0x0BA4, 0x0BCD, 0x0BA4, 0x0BBF, 0x0BB0, 0x0BC8 };
661 : rtl::OUString aTest(CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI,
662 1 : SAL_N_ELEMENTS(CA_VOWELSIGNI_TA_VIRAMA_TA_VOWELSIGNI_RA_VOWELSIGNAI));
663 :
664 1 : sal_Int32 nDone=0;
665 1 : sal_Int32 nPos=0;
666 :
667 5 : for (sal_Int32 i = 0; i < 4; ++i)
668 : {
669 4 : sal_Int32 nOldPos = nPos;
670 4 : nPos = m_xBreak->nextCharacters(aTest, nPos, aLocale,
671 4 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
672 4 : CPPUNIT_ASSERT_MESSAGE("Should skip 2 units", nPos == nOldPos+2);
673 : }
674 :
675 5 : for (sal_Int32 i = 0; i < 4; ++i)
676 : {
677 4 : sal_Int32 nOldPos = nPos;
678 4 : nPos = m_xBreak->previousCharacters(aTest, nPos, aLocale,
679 4 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
680 4 : CPPUNIT_ASSERT_MESSAGE("Should skip 2 units", nPos == nOldPos-2);
681 1 : }
682 : }
683 :
684 : {
//Unlike the other cases this one counts how many forward steps are needed to
//reach the end of the text, and it passes a default-constructed lang::Locale()
//instead of aLocale
685 1 : const sal_Unicode ALEF_QAMATS [] = { 0x05D0, 0x05B8 };
686 1 : rtl::OUString aText(ALEF_QAMATS, SAL_N_ELEMENTS(ALEF_QAMATS));
687 :
688 1 : sal_Int32 nGraphemeCount = 0;
689 :
690 1 : sal_Int32 nCurPos = 0;
691 3 : while (nCurPos < aText.getLength())
692 : {
//The same variable is passed for both of the last two arguments; it is reset
//to 1 at the top of every iteration
693 1 : sal_Int32 nCount2 = 1;
694 1 : nCurPos = m_xBreak->nextCharacters(aText, nCurPos, lang::Locale(),
695 1 : i18n::CharacterIteratorMode::SKIPCELL, nCount2, nCount2);
696 1 : ++nGraphemeCount;
697 : }
698 :
699 1 : CPPUNIT_ASSERT_MESSAGE("Should be considered 1 grapheme", nGraphemeCount == 1);
700 : }
701 :
//Last case runs under locale hi-IN
702 1 : aLocale.Language = rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("hi"));
703 1 : aLocale.Country = rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("IN"));
704 :
705 : {
706 1 : const sal_Unicode SHA_VOWELSIGNII[] = { 0x936, 0x940 };
707 1 : rtl::OUString aTest(SHA_VOWELSIGNII, SAL_N_ELEMENTS(SHA_VOWELSIGNII));
708 :
709 1 : sal_Int32 nDone=0;
710 1 : sal_Int32 nPos = 0;
711 :
712 1 : nPos = m_xBreak->nextCharacters(aTest, 0, aLocale,
713 1 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
714 1 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == SAL_N_ELEMENTS(SHA_VOWELSIGNII));
715 1 : nPos = m_xBreak->previousCharacters(aTest, SAL_N_ELEMENTS(SHA_VOWELSIGNII), aLocale,
716 1 : i18n::CharacterIteratorMode::SKIPCELL, 1, nDone);
717 1 : CPPUNIT_ASSERT_MESSAGE("Should skip full grapheme", nPos == 0);
718 1 : }
719 1 : }
720 :
721 : //A test to ensure that certain ranges and codepoints that are categorized as
722 : //weak remain as weak, so that existing docs that depend on this don't silently
723 : //change font for those weak chars
//Asserts that getScriptType reports ScriptType::WEAK for every code point
//listed in WEAKS. On failure the message names the offending code point in hex.
724 1 : void TestBreakIterator::testWeak()
725 : {
//NOTE(review): aLocale is set up here but never passed to anything in this
//function - getScriptType below only takes the text and an index
726 1 : lang::Locale aLocale;
727 1 : aLocale.Language = rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("en"));
728 1 : aLocale.Country = rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("US"));
729 :
730 : {
//Each pair below is the first and last code point of a range, but only these
//listed values are tested; code points between them are not checked
731 : const sal_Unicode WEAKS[] =
732 : {
733 : 0x0001, 0x0002,
734 : 0x0020, 0x00A0,
735 : 0x2150, 0x215F, //Number Forms, fractions
736 : 0x2160, 0x2180, //Number Forms, roman numerals
737 : 0x2200, 0x22FF, //Mathematical Operators
738 : 0x27C0, 0x27EF, //Miscellaneous Mathematical Symbols-A
739 : 0x2980, 0x29FF, //Miscellaneous Mathematical Symbols-B
740 : 0x2A00, 0x2AFF, //Supplemental Mathematical Operators
741 : 0x2100, 0x214F, //Letterlike Symbols
742 : 0x2308, 0x230B, //Miscellaneous technical
743 : 0x25A0, 0x25FF, //Geometric Shapes
744 : 0x2B30, 0x2B4C //Miscellaneous Symbols and Arrows
745 1 : };
746 1 : rtl::OUString aWeaks(WEAKS, SAL_N_ELEMENTS(WEAKS));
747 :
748 25 : for (sal_Int32 i = 0; i < aWeaks.getLength(); ++i)
749 : {
750 24 : sal_Int16 nScript = m_xBreak->getScriptType(aWeaks, i);
//Build "Char 0x<hex> should have been weak"; 16 is the radix for append
751 24 : rtl::OStringBuffer aMsg;
752 24 : aMsg.append(RTL_CONSTASCII_STRINGPARAM("Char 0x"));
753 24 : aMsg.append(static_cast<sal_Int32>(aWeaks.getStr()[i]), 16);
754 24 : aMsg.append(RTL_CONSTASCII_STRINGPARAM(" should have been weak"));
755 48 : CPPUNIT_ASSERT_MESSAGE(aMsg.getStr(),
756 24 : nScript == i18n::ScriptType::WEAK);
757 25 : }
758 1 : }
759 1 : }
760 :
761 : //A test to ensure that certain ranges and codepoints that are categorized as
762 : //Asian remain as Asian, so that existing docs that depend on this don't silently
763 : //change font for those Asian chars.
764 : //See https://bugs.freedesktop.org/show_bug.cgi?id=38095
//Asserts that getScriptType reports ScriptType::ASIAN for every code point
//listed in ASIANS. On failure the message names the offending code point in hex.
765 1 : void TestBreakIterator::testAsian()
766 : {
//NOTE(review): aLocale is set up here but never passed to anything in this
//function - getScriptType below only takes the text and an index
767 1 : lang::Locale aLocale;
768 1 : aLocale.Language = rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("en"));
769 1 : aLocale.Country = rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("US"));
770 :
771 : {
//Each pair is the first and last code point of a range; only these six listed
//values are tested, not the code points between them
772 : const sal_Unicode ASIANS[] =
773 : {
774 : //some typical CJK chars
775 : 0x4E00, 0x62FF,
776 : //The full HalfWidth and FullWidth block has historically been
777 : //designated as taking the CJK font :-(
778 : //HalfWidth and FullWidth forms of ASCII 0-9, categorized under
779 : //UAX24 as "Common" i.e. by that logic WEAK
780 : 0xFF10, 0xFF19,
781 : //HalfWidth and FullWidth forms of ASCII A-z, categorized under
782 : //UAX24 as "Latin", i.e. by that logic LATIN
783 : 0xFF21, 0xFF5A
784 1 : };
785 1 : rtl::OUString aAsians(ASIANS, SAL_N_ELEMENTS(ASIANS));
786 :
787 7 : for (sal_Int32 i = 0; i < aAsians.getLength(); ++i)
788 : {
789 6 : sal_Int16 nScript = m_xBreak->getScriptType(aAsians, i);
//Build "Char 0x<hex> should have been asian"; 16 is the radix for append
790 6 : rtl::OStringBuffer aMsg;
791 6 : aMsg.append(RTL_CONSTASCII_STRINGPARAM("Char 0x"));
792 6 : aMsg.append(static_cast<sal_Int32>(aAsians.getStr()[i]), 16);
793 6 : aMsg.append(RTL_CONSTASCII_STRINGPARAM(" should have been asian"));
794 12 : CPPUNIT_ASSERT_MESSAGE(aMsg.getStr(),
795 6 : nScript == i18n::ScriptType::ASIAN);
796 7 : }
797 1 : }
798 1 : }
799 :
800 : //A test to ensure that our thai word boundary detection is useful
801 1 : void TestBreakIterator::testThai()
802 : {
803 1 : lang::Locale aLocale;
804 1 : aLocale.Language = rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("th"));
805 1 : aLocale.Country = rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("TH"));
806 :
807 : //See http://lists.freedesktop.org/archives/libreoffice/2012-February/025959.html
808 : {
809 1 : const sal_Unicode THAI[] = { 0x0E01, 0x0E38, 0x0E2B, 0x0E25, 0x0E32, 0x0E1A };
810 1 : rtl::OUString aTest(THAI, SAL_N_ELEMENTS(THAI));
811 1 : i18n::Boundary aBounds = m_xBreak->getWordBoundary(aTest, 0, aLocale,
812 1 : i18n::WordType::DICTIONARY_WORD, true);
813 2 : CPPUNIT_ASSERT_MESSAGE("Should skip full word",
814 2 : aBounds.startPos == 0 && aBounds.endPos == aTest.getLength());
815 : }
816 :
817 : //See https://issues.apache.org/ooo/show_bug.cgi?id=29548
818 : //make sure forwards and back are consistent
819 : {
820 : const sal_Unicode THAI[] =
821 : {
822 : 0x0E2D, 0x0E38, 0x0E17, 0x0E22, 0x0E32, 0x0E19, 0x0E41,
823 : 0x0E2B, 0x0E48, 0x0E07, 0x0E0A, 0x0E32, 0x0E15, 0x0E34,
824 : 0x0E19, 0x0E49, 0x0E33, 0x0E2B, 0x0E19, 0x0E32, 0x0E27,
825 : 0x0E2D, 0x0E38, 0x0E17, 0x0E22, 0x0E32, 0x0E19, 0x0E41,
826 : 0x0E2B, 0x0E48, 0x0E07, 0x0E0A, 0x0E32, 0x0E15, 0x0E34,
827 : 0x0E19, 0x0E49, 0x0E33, 0x0E2B, 0x0E19, 0x0E32, 0x0E27
828 1 : };
829 1 : rtl::OUString aTest(THAI, SAL_N_ELEMENTS(THAI));
830 :
831 1 : std::stack<sal_Int32> aPositions;
832 1 : sal_Int32 nPos = -1;
833 11 : do
834 : {
835 11 : nPos = m_xBreak->nextWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
836 11 : aPositions.push(nPos);
837 : }
838 11 : while (nPos < aTest.getLength());
839 1 : nPos = aTest.getLength();
840 1 : CPPUNIT_ASSERT(!aPositions.empty());
841 1 : aPositions.pop();
842 10 : do
843 : {
844 10 : CPPUNIT_ASSERT(!aPositions.empty());
845 10 : nPos = m_xBreak->previousWord(aTest, nPos, aLocale, i18n::WordType::ANYWORD_IGNOREWHITESPACES).startPos;
846 10 : CPPUNIT_ASSERT(nPos == aPositions.top());
847 10 : aPositions.pop();
848 : }
849 1 : while (nPos > 0);
850 1 : }
851 1 : }
852 :
#if TODO
//Not built unless TODO is defined to a non-zero value.
//Same whole-word check as the Thai test, with the "nod"/"TH" locale.
void TestBreakIterator::testNorthernThai()
{
    lang::Locale aNodLocale;
    aNodLocale.Language = rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("nod"));
    aNodLocale.Country = rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("TH"));

    const sal_Unicode aWord[] = { 0x0E01, 0x0E38, 0x0E4A, 0x0E2B, 0x0E25, 0x0E32, 0x0E1A };
    const rtl::OUString aText(aWord, SAL_N_ELEMENTS(aWord));

    //The dictionary word reported at offset 0 has to cover the whole string
    const i18n::Boundary aWordSpan = m_xBreak->getWordBoundary(
        aText, 0, aNodLocale, i18n::WordType::DICTIONARY_WORD, true);
    CPPUNIT_ASSERT_MESSAGE("Should skip full word",
        aWordSpan.startPos == 0 && aWordSpan.endPos == aText.getLength());
}
#endif
868 :
#if (U_ICU_VERSION_MAJOR_NUM > 4)
//A test to ensure that our khmer word boundary detection is useful
//https://bugs.freedesktop.org/show_bug.cgi?id=52020
//
//icu doesn't have the Khmer word boundary dictionaries in <= 4.0.0 but does in
//the current 49.x.y . Not sure which version first had them introduced.
void TestBreakIterator::testKhmer()
{
    lang::Locale aKhmerLocale;
    aKhmerLocale.Language = rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("km"));
    aKhmerLocale.Country = rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("KH"));

    const sal_Unicode aChars[] = { 0x17B2, 0x17D2, 0x1799, 0x1782, 0x17C1 };
    const rtl::OUString aText(aChars, SAL_N_ELEMENTS(aChars));

    //The word found at offset 0 is expected to be reported as start 0, end 3
    i18n::Boundary aWordSpan = m_xBreak->getWordBoundary(
        aText, 0, aKhmerLocale, i18n::WordType::DICTIONARY_WORD, true);
    CPPUNIT_ASSERT(aWordSpan.startPos == 0 && aWordSpan.endPos == 3);

    //Query again from where the first word ended; expect start 3, end 5
    aWordSpan = m_xBreak->getWordBoundary(
        aText, aWordSpan.endPos, aKhmerLocale, i18n::WordType::DICTIONARY_WORD, true);
    CPPUNIT_ASSERT(aWordSpan.startPos == 3 && aWordSpan.endPos == 5);
}
#endif
895 :
896 1 : void TestBreakIterator::testJapanese()
897 : {
898 1 : lang::Locale aLocale;
899 1 : aLocale.Language = OUString("ja");
900 1 : aLocale.Country = OUString("JP");
901 1 : i18n::Boundary aBounds;
902 :
903 : {
904 1 : const sal_Unicode JAPANESE[] = { 0x30B7, 0x30E3, 0x30C3, 0x30C8, 0x30C0, 0x30A6, 0x30F3 };
905 :
906 1 : rtl::OUString aTest(JAPANESE, SAL_N_ELEMENTS(JAPANESE));
907 1 : aBounds = m_xBreak->getWordBoundary(aTest, 5, aLocale,
908 1 : i18n::WordType::DICTIONARY_WORD, true);
909 :
910 1 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 7);
911 : }
912 :
913 : {
914 1 : const sal_Unicode JAPANESE[] = { 0x9EBB, 0x306E, 0x8449, 0x9EBB, 0x306E, 0x8449 };
915 :
916 1 : rtl::OUString aTest(JAPANESE, SAL_N_ELEMENTS(JAPANESE));
917 1 : aBounds = m_xBreak->getWordBoundary(aTest, 1, aLocale,
918 1 : i18n::WordType::DICTIONARY_WORD, true);
919 :
920 1 : CPPUNIT_ASSERT(aBounds.startPos == 0 && aBounds.endPos == 3);
921 :
922 1 : aBounds = m_xBreak->getWordBoundary(aTest, 5, aLocale,
923 1 : i18n::WordType::DICTIONARY_WORD, true);
924 :
925 1 : CPPUNIT_ASSERT(aBounds.startPos == 3 && aBounds.endPos == 6);
926 1 : }
927 1 : }
928 :
929 6 : void TestBreakIterator::setUp()
930 : {
931 6 : BootstrapFixtureBase::setUp();
932 6 : m_xBreak = uno::Reference< i18n::XBreakIterator >(m_xSFactory->createInstance(
933 6 : "com.sun.star.i18n.BreakIterator"), uno::UNO_QUERY_THROW);
934 6 : }
935 :
936 6 : void TestBreakIterator::tearDown()
937 : {
938 6 : m_xBreak.clear();
939 6 : BootstrapFixtureBase::tearDown();
940 6 : }
941 :
//Register the TestBreakIterator suite with CppUnit so the test runner finds it
942 1 : CPPUNIT_TEST_SUITE_REGISTRATION(TestBreakIterator);
943 :
//Provide the CppUnit test plug-in boilerplate for this test library
944 4 : CPPUNIT_PLUGIN_IMPLEMENT();
945 :
946 : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|