Line data Source code
1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /*
3 : * This file is part of the LibreOffice project.
4 : *
5 : * This Source Code Form is subject to the terms of the Mozilla Public
6 : * License, v. 2.0. If a copy of the MPL was not distributed with this
7 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 : *
9 : * This file incorporates work covered by the following license notice:
10 : *
11 : * Licensed to the Apache Software Foundation (ASF) under one or more
12 : * contributor license agreements. See the NOTICE file distributed
13 : * with this work for additional information regarding copyright
14 : * ownership. The ASF licenses this file to you under the Apache
15 : * License, Version 2.0 (the "License"); you may not use this file
16 : * except in compliance with the License. You may obtain a copy of
17 : * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 : */
19 :
20 :
21 : // xdictionary.cpp: implementation of the xdictionary class.
22 :
23 :
24 :
25 :
26 : #include <rtl/ustrbuf.hxx>
27 :
28 : #include <com/sun/star/i18n/WordType.hpp>
29 : #include <xdictionary.hxx>
30 : #include <unicode/uchar.h>
31 : #include <string.h>
32 : #include <breakiteratorImpl.hxx>
33 :
34 :
35 : // Construction/Destruction
36 :
37 :
38 :
39 : namespace com { namespace sun { namespace star { namespace i18n {
40 :
41 : #ifndef DISABLE_DYNLOADING
42 :
43 0 : extern "C" { static void SAL_CALL thisModule() {} }
44 :
45 : #else
46 :
47 : extern "C" {
48 :
49 : sal_uInt8* getExistMark_ja();
50 : sal_Int16* getIndex1_ja();
51 : sal_Int32* getIndex2_ja();
52 : sal_Int32* getLenArray_ja();
53 : sal_Unicode* getDataArea_ja();
54 :
55 : sal_uInt8* getExistMark_zh();
56 : sal_Int16* getIndex1_zh();
57 : sal_Int32* getIndex2_zh();
58 : sal_Int32* getLenArray_zh();
59 : sal_Unicode* getDataArea_zh();
60 :
61 : }
62 :
63 : #endif
64 :
65 0 : xdictionary::xdictionary(const sal_Char *lang) :
66 : existMark( NULL ),
67 : index1( NULL ),
68 : index2( NULL ),
69 : lenArray( NULL ),
70 : dataArea( NULL ),
71 : #ifndef DISABLE_DYNLOADING
72 : hModule( NULL ),
73 : #endif
74 : boundary(),
75 0 : japaneseWordBreak( sal_False )
76 : {
77 0 : index1 = 0;
78 : #ifndef DISABLE_DYNLOADING
79 : #ifdef SAL_DLLPREFIX
80 0 : OUStringBuffer aBuf( strlen(lang) + 7 + 6 ); // mostly "lib*.so" (with * == dict_zh)
81 0 : aBuf.appendAscii( SAL_DLLPREFIX );
82 : #else
83 : OUStringBuffer aBuf( strlen(lang) + 7 + 4 ); // mostly "*.dll" (with * == dict_zh)
84 : #endif
85 0 : aBuf.appendAscii( "dict_" ).appendAscii( lang ).appendAscii( SAL_DLLEXTENSION );
86 0 : hModule = osl_loadModuleRelative( &thisModule, aBuf.makeStringAndClear().pData, SAL_LOADMODULE_DEFAULT );
87 0 : if( hModule ) {
88 : sal_IntPtr (*func)();
89 0 : func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString("getExistMark").pData );
90 0 : existMark = (sal_uInt8*) (*func)();
91 0 : func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString("getIndex1").pData );
92 0 : index1 = (sal_Int16*) (*func)();
93 0 : func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString("getIndex2").pData );
94 0 : index2 = (sal_Int32*) (*func)();
95 0 : func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString("getLenArray").pData );
96 0 : lenArray = (sal_Int32*) (*func)();
97 0 : func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString("getDataArea").pData );
98 0 : dataArea = (sal_Unicode*) (*func)();
99 : }
100 : else
101 : {
102 0 : existMark = NULL;
103 0 : index1 = NULL;
104 0 : index2 = NULL;
105 0 : lenArray = NULL;
106 0 : dataArea = NULL;
107 : }
108 :
109 : #else
110 : if( strcmp( lang, "ja" ) == 0 ) {
111 : existMark = getExistMark_ja();
112 : index1 = getIndex1_ja();
113 : index2 = getIndex2_ja();
114 : lenArray = getLenArray_ja();
115 : dataArea = getDataArea_ja();
116 : }
117 : else if( strcmp( lang, "zh" ) == 0 ) {
118 : existMark = getExistMark_zh();
119 : index1 = getIndex1_zh();
120 : index2 = getIndex2_zh();
121 : lenArray = getLenArray_zh();
122 : dataArea = getDataArea_zh();
123 : }
124 : else
125 : {
126 : existMark = NULL;
127 : index1 = NULL;
128 : index2 = NULL;
129 : lenArray = NULL;
130 : dataArea = NULL;
131 : }
132 : #endif
133 :
134 0 : for (sal_Int32 i = 0; i < CACHE_MAX; i++)
135 0 : cache[i].size = 0;
136 :
137 0 : japaneseWordBreak = sal_False;
138 0 : }
139 :
140 0 : xdictionary::~xdictionary()
141 : {
142 : #ifndef DISABLE_DYNLOADING
143 0 : osl_unloadModule(hModule);
144 : #endif
145 0 : for (sal_Int32 i = 0; i < CACHE_MAX; i++) {
146 0 : if (cache[i].size > 0) {
147 0 : delete [] cache[i].contents;
148 0 : delete [] cache[i].wordboundary;
149 : }
150 : }
151 0 : }
152 :
153 0 : void xdictionary::setJapaneseWordBreak()
154 : {
155 0 : japaneseWordBreak = sal_True;
156 0 : }
157 :
158 0 : sal_Bool xdictionary::exists(const sal_uInt32 c)
159 : {
160 : // 0x1FFF is the hardcoded limit in gendict for existMarks
161 0 : sal_Bool exist = (existMark && ((c>>3) < 0x1FFF)) ? sal::static_int_cast<sal_Bool>((existMark[c>>3] & (1<<(c&0x07))) != 0) : sal_False;
162 0 : if (!exist && japaneseWordBreak)
163 0 : return BreakIteratorImpl::getScriptClass(c) == ScriptType::ASIAN;
164 : else
165 0 : return exist;
166 : }
167 :
168 0 : sal_Int32 xdictionary::getLongestMatch(const sal_Unicode* str, sal_Int32 sLen)
169 : {
170 :
171 0 : if ( !index1 ) return 0;
172 :
173 0 : sal_Int16 idx = index1[str[0] >> 8];
174 :
175 0 : if (idx == 0xFF) return 0;
176 :
177 0 : idx = (idx<<8) | (str[0]&0xff);
178 :
179 0 : sal_uInt32 begin = index2[idx], end = index2[idx+1];
180 :
181 0 : if (begin == 0) return 0;
182 :
183 0 : str++; sLen--; // first character is not stored in the dictionary
184 0 : for (sal_uInt32 i = end; i > begin; i--) {
185 0 : sal_Int32 len = lenArray[i] - lenArray[i - 1];
186 0 : if (sLen >= len) {
187 0 : const sal_Unicode *dstr = dataArea + lenArray[i-1];
188 0 : sal_Int32 pos = 0;
189 :
190 0 : while (pos < len && dstr[pos] == str[pos]) { pos++; }
191 :
192 0 : if (pos == len)
193 0 : return len + 1;
194 : }
195 : }
196 0 : return 0;
197 : }
198 :
199 :
200 : /*
201 : * c-tor
202 : */
203 :
204 0 : WordBreakCache::WordBreakCache() :
205 : length( 0 ),
206 : contents( NULL ),
207 : wordboundary( NULL ),
208 0 : size( 0 )
209 : {
210 0 : }
211 :
212 : /*
213 : * Compare two unicode string,
214 : */
215 :
216 0 : sal_Bool WordBreakCache::equals(const sal_Unicode* str, Boundary& boundary)
217 : {
218 : // Different length, different string.
219 0 : if (length != boundary.endPos - boundary.startPos) return sal_False;
220 :
221 0 : for (sal_Int32 i = 0; i < length; i++)
222 0 : if (contents[i] != str[i + boundary.startPos]) return sal_False;
223 :
224 0 : return sal_True;
225 : }
226 :
227 :
228 : /*
229 : * Retrieve the segment containing the character at pos.
230 : * @param pos : Position of the given character.
231 : * @return true if CJK.
232 : */
233 0 : sal_Bool xdictionary::seekSegment(const OUString &rText, sal_Int32 pos,
234 : Boundary& segBoundary)
235 : {
236 : sal_Int32 indexUtf16;
237 0 : segBoundary.endPos = segBoundary.startPos = pos;
238 :
239 0 : indexUtf16 = pos;
240 0 : while (indexUtf16 > 0)
241 : {
242 0 : sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, -1);
243 0 : if (u_isWhitespace(ch) || exists(ch))
244 0 : segBoundary.startPos = indexUtf16;
245 : else
246 0 : break;
247 : }
248 :
249 0 : indexUtf16 = pos;
250 0 : while (indexUtf16 < rText.getLength())
251 : {
252 0 : sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1);
253 0 : if (u_isWhitespace(ch) || exists(ch))
254 0 : segBoundary.endPos = indexUtf16;
255 : else
256 0 : break;
257 : }
258 :
259 0 : indexUtf16 = segBoundary.startPos;
260 0 : rText.iterateCodePoints(&indexUtf16, 1);
261 0 : return segBoundary.endPos > indexUtf16;
262 : }
263 :
264 : #define KANJA 1
265 : #define KATAKANA 2
266 : #define HIRAKANA 3
267 :
268 0 : static sal_Int16 JapaneseCharType(sal_Unicode c)
269 : {
270 0 : if (0x3041 <= c && c <= 0x309e)
271 0 : return HIRAKANA;
272 0 : if ((0x30a1 <= c && c <= 0x30fe) || (0xff65 <= c && c <= 0xff9f))
273 0 : return KATAKANA;
274 0 : return KANJA;
275 : }
276 :
277 0 : WordBreakCache& xdictionary::getCache(const sal_Unicode *text, Boundary& wordBoundary)
278 : {
279 0 : WordBreakCache& rCache = cache[text[0] & 0x1f];
280 :
281 0 : if (rCache.size != 0 && rCache.equals(text, wordBoundary))
282 0 : return rCache;
283 :
284 0 : sal_Int32 len = wordBoundary.endPos - wordBoundary.startPos;
285 :
286 0 : if (rCache.size == 0 || len > rCache.size) {
287 0 : if (rCache.size != 0) {
288 0 : delete [] rCache.contents;
289 0 : delete [] rCache.wordboundary;
290 0 : rCache.size = len;
291 : }
292 : else
293 0 : rCache.size = len > DEFAULT_SIZE ? len : DEFAULT_SIZE;
294 0 : rCache.contents = new sal_Unicode[rCache.size + 1];
295 0 : rCache.wordboundary = new sal_Int32[rCache.size + 2];
296 : }
297 0 : rCache.length = len;
298 0 : memcpy(rCache.contents, text + wordBoundary.startPos, len * sizeof(sal_Unicode));
299 0 : *(rCache.contents + len) = 0x0000;
300 : // reset the wordboundary in cache
301 0 : memset(rCache.wordboundary, '\0', sizeof(sal_Int32)*(len + 2));
302 :
303 0 : sal_Int32 i = 0; // loop variable
304 0 : while (rCache.wordboundary[i] < rCache.length) {
305 0 : len = 0;
306 : // look the continuous white space as one word and cashe it
307 0 : while (u_isWhitespace((sal_uInt32)text[wordBoundary.startPos + rCache.wordboundary[i] + len]))
308 0 : len ++;
309 :
310 0 : if (len == 0) {
311 0 : const sal_Unicode *str = text + wordBoundary.startPos + rCache.wordboundary[i];
312 0 : sal_Int32 slen = rCache.length - rCache.wordboundary[i];
313 0 : sal_Int16 type = 0, count = 0;
314 0 : for (;len == 0 && slen > 0; str++, slen--) {
315 0 : len = getLongestMatch(str, slen);
316 0 : if (len == 0) {
317 0 : if (!japaneseWordBreak) {
318 0 : len = 1;
319 : } else {
320 0 : if (count == 0)
321 0 : type = JapaneseCharType(*str);
322 0 : else if (type != JapaneseCharType(*str))
323 0 : break;
324 0 : count++;
325 : }
326 : }
327 : }
328 0 : if (count)
329 : {
330 0 : rCache.wordboundary[i+1] = rCache.wordboundary[i] + count;
331 0 : i++;
332 : }
333 : }
334 :
335 0 : if (len) {
336 0 : rCache.wordboundary[i+1] = rCache.wordboundary[i] + len;
337 0 : i++;
338 : }
339 : }
340 0 : rCache.wordboundary[i + 1] = rCache.length + 1;
341 :
342 0 : return rCache;
343 : }
344 :
345 0 : Boundary xdictionary::previousWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
346 : {
347 : // looking for the first non-whitespace character from anyPos
348 0 : sal_uInt32 ch = rText.iterateCodePoints(&anyPos, -1);
349 :
350 0 : while (anyPos > 0 && u_isWhitespace(ch)) ch = rText.iterateCodePoints(&anyPos, -1);
351 :
352 0 : return getWordBoundary(rText, anyPos, wordType, true);
353 : }
354 :
355 0 : Boundary xdictionary::nextWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
356 : {
357 0 : boundary = getWordBoundary(rText, anyPos, wordType, true);
358 0 : anyPos = boundary.endPos;
359 0 : if (anyPos < rText.getLength()) {
360 : // looknig for the first non-whitespace character from anyPos
361 0 : sal_uInt32 ch = rText.iterateCodePoints(&anyPos, 1);
362 0 : while (u_isWhitespace(ch)) ch=rText.iterateCodePoints(&anyPos, 1);
363 0 : rText.iterateCodePoints(&anyPos, -1);
364 : }
365 :
366 0 : return getWordBoundary(rText, anyPos, wordType, true);
367 : }
368 :
369 0 : Boundary xdictionary::getWordBoundary(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType, sal_Bool bDirection)
370 : {
371 0 : const sal_Unicode *text=rText.getStr();
372 0 : sal_Int32 len=rText.getLength();
373 0 : if (anyPos >= len || anyPos < 0) {
374 0 : boundary.startPos = boundary.endPos = anyPos < 0 ? 0 : len;
375 0 : } else if (seekSegment(rText, anyPos, boundary)) { // character in dict
376 0 : WordBreakCache& aCache = getCache(text, boundary);
377 0 : sal_Int32 i = 0;
378 :
379 0 : while (aCache.wordboundary[i] <= anyPos - boundary.startPos) i++;
380 :
381 0 : sal_Int32 startPos = aCache.wordboundary[i - 1];
382 : // if bDirection is false
383 0 : if (!bDirection && startPos > 0 && startPos == (anyPos - boundary.startPos))
384 : {
385 0 : sal_Int32 indexUtf16 = anyPos-1;
386 0 : sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1);
387 0 : if (u_isWhitespace(ch))
388 0 : i--;
389 : }
390 :
391 0 : boundary.endPos = boundary.startPos;
392 0 : boundary.endPos += aCache.wordboundary[i];
393 0 : boundary.startPos += aCache.wordboundary[i-1];
394 :
395 : } else {
396 0 : boundary.startPos = anyPos;
397 0 : if (anyPos < len) rText.iterateCodePoints(&anyPos, 1);
398 0 : boundary.endPos = anyPos < len ? anyPos : len;
399 : }
400 0 : if (wordType == WordType::WORD_COUNT) {
401 : // skip punctuation for word count.
402 0 : while (boundary.endPos < len)
403 : {
404 0 : sal_Int32 indexUtf16 = boundary.endPos;
405 0 : if (u_ispunct(rText.iterateCodePoints(&indexUtf16, 1)))
406 0 : boundary.endPos = indexUtf16;
407 : else
408 0 : break;
409 : }
410 : }
411 :
412 0 : return boundary;
413 : }
414 :
415 : } } } }
416 :
417 : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|