Line data Source code
1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /*
3 : * This file is part of the LibreOffice project.
4 : *
5 : * This Source Code Form is subject to the terms of the Mozilla Public
6 : * License, v. 2.0. If a copy of the MPL was not distributed with this
7 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 : *
9 : * This file incorporates work covered by the following license notice:
10 : *
11 : * Licensed to the Apache Software Foundation (ASF) under one or more
12 : * contributor license agreements. See the NOTICE file distributed
13 : * with this work for additional information regarding copyright
14 : * ownership. The ASF licenses this file to you under the Apache
15 : * License, Version 2.0 (the "License"); you may not use this file
16 : * except in compliance with the License. You may obtain a copy of
17 : * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 : */
19 :
20 : #include <config_folders.h>
21 :
22 : #include <osl/file.h>
23 : #include <osl/mutex.hxx>
24 : #include <rtl/ustrbuf.hxx>
25 : #include <rtl/bootstrap.hxx>
26 : #include <com/sun/star/i18n/WordType.hpp>
27 : #include <xdictionary.hxx>
28 : #include <unicode/uchar.h>
29 : #include <string.h>
30 : #include <breakiteratorImpl.hxx>
31 :
32 : namespace com { namespace sun { namespace star { namespace i18n {
33 :
34 : #ifdef DICT_JA_ZH_IN_DATAFILE
35 :
36 : #elif !defined DISABLE_DYNLOADING
37 :
38 0 : extern "C" { static void SAL_CALL thisModule() {} }
39 :
40 : #else
41 :
42 : extern "C" {
43 :
44 : sal_uInt8* getExistMark_ja();
45 : sal_Int16* getIndex1_ja();
46 : sal_Int32* getIndex2_ja();
47 : sal_Int32* getLenArray_ja();
48 : sal_Unicode* getDataArea_ja();
49 :
50 : sal_uInt8* getExistMark_zh();
51 : sal_Int16* getIndex1_zh();
52 : sal_Int32* getIndex2_zh();
53 : sal_Int32* getLenArray_zh();
54 : sal_Unicode* getDataArea_zh();
55 :
56 : }
57 :
58 : #endif
59 :
60 102 : xdictionary::xdictionary(const sal_Char *lang) :
61 : boundary(),
62 102 : japaneseWordBreak( false )
63 : {
64 :
65 : #ifdef DICT_JA_ZH_IN_DATAFILE
66 :
67 : if( strcmp( lang, "ja" ) == 0 || strcmp( lang, "zh" ) == 0 )
68 : {
69 : OUString sUrl( "$BRAND_BASE_DIR/" LIBO_SHARE_FOLDER "/dict_" );
70 : rtl::Bootstrap::expandMacros(sUrl);
71 :
72 : if( strcmp( lang, "ja" ) == 0 )
73 : sUrl += "ja.data";
74 : else if( strcmp( lang, "zh" ) == 0 )
75 : sUrl += "zh.data";
76 :
77 : oslFileHandle aFileHandle;
78 : sal_uInt64 nFileSize;
79 : char *pMapping;
80 : if( osl_openFile( sUrl.pData, &aFileHandle, osl_File_OpenFlag_Read ) == osl_File_E_None &&
81 : osl_getFileSize( aFileHandle, &nFileSize) == osl_File_E_None &&
82 : osl_mapFile( aFileHandle, (void **) &pMapping, nFileSize, 0, osl_File_MapFlag_RandomAccess ) == osl_File_E_None )
83 : {
84 : // We have the offsets to the parts of the file at its end, see gendict.cxx
85 : sal_Int64 *pEOF = (sal_Int64*)(pMapping + nFileSize);
86 :
87 : data.existMark = (sal_uInt8*) (pMapping + pEOF[-1]);
88 : data.index2 = (sal_Int32*) (pMapping + pEOF[-2]);
89 : data.index1 = (sal_Int16*) (pMapping + pEOF[-3]);
90 : data.lenArray = (sal_Int32*) (pMapping + pEOF[-4]);
91 : data.dataArea = (sal_Unicode*) (pMapping + pEOF[-5]);
92 : }
93 : }
94 :
95 : #elif !defined DISABLE_DYNLOADING
96 :
97 102 : initDictionaryData( lang );
98 :
99 : #else
100 :
101 : if( strcmp( lang, "ja" ) == 0 ) {
102 : data.existMark = getExistMark_ja();
103 : data.index1 = getIndex1_ja();
104 : data.index2 = getIndex2_ja();
105 : data.lenArray = getLenArray_ja();
106 : data.dataArea = getDataArea_ja();
107 : }
108 : else if( strcmp( lang, "zh" ) == 0 ) {
109 : data.existMark = getExistMark_zh();
110 : data.index1 = getIndex1_zh();
111 : data.index2 = getIndex2_zh();
112 : data.lenArray = getLenArray_zh();
113 : data.dataArea = getDataArea_zh();
114 : }
115 :
116 : #endif
117 :
118 3366 : for (sal_Int32 i = 0; i < CACHE_MAX; i++)
119 3264 : cache[i].size = 0;
120 :
121 102 : japaneseWordBreak = false;
122 102 : }
123 :
124 204 : xdictionary::~xdictionary()
125 : {
126 3366 : for (sal_Int32 i = 0; i < CACHE_MAX; i++) {
127 3264 : if (cache[i].size > 0) {
128 10 : delete [] cache[i].contents;
129 10 : delete [] cache[i].wordboundary;
130 : }
131 : }
132 102 : }
133 :
134 : namespace {
135 76 : struct datacache {
136 : oslModule mhModule;
137 : OString maLang;
138 : xdictionarydata maData;
139 : };
140 : }
141 :
142 : #if !defined(DICT_JA_ZH_IN_DATAFILE) && !defined(DISABLE_DYNLOADING)
143 :
144 102 : void xdictionary::initDictionaryData(const sal_Char *pLang)
145 : {
146 : // Global cache, never released for performance
147 102 : static std::vector< datacache > aLoadedCache;
148 :
149 102 : osl::MutexGuard aGuard( osl::Mutex::getGlobalMutex() );
150 104 : for( size_t i = 0; i < aLoadedCache.size(); ++i )
151 : {
152 86 : if( !strcmp( pLang, aLoadedCache[ i ].maLang.getStr() ) )
153 : {
154 84 : data = aLoadedCache[ i ].maData;
155 186 : return;
156 : }
157 : }
158 :
159 : // otherwise add to the cache, positive or negative.
160 36 : datacache aEntry;
161 18 : aEntry.maLang = OString( pLang, strlen( pLang ) );
162 :
163 : #ifdef SAL_DLLPREFIX
164 36 : OUStringBuffer aBuf( strlen( pLang ) + 7 + 6 ); // mostly "lib*.so" (with * == dict_zh)
165 18 : aBuf.appendAscii( SAL_DLLPREFIX );
166 : #else
167 : OUStringBuffer aBuf( strlen( pLang ) + 7 + 4 ); // mostly "*.dll" (with * == dict_zh)
168 : #endif
169 18 : aBuf.appendAscii( "dict_" ).appendAscii( pLang ).appendAscii( SAL_DLLEXTENSION );
170 18 : aEntry.mhModule = osl_loadModuleRelative( &thisModule, aBuf.makeStringAndClear().pData, SAL_LOADMODULE_DEFAULT );
171 18 : if( aEntry.mhModule ) {
172 : oslGenericFunction func;
173 18 : func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getExistMark" );
174 18 : aEntry.maData.existMark = ((sal_uInt8 const * (*)()) func)();
175 18 : func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getIndex1" );
176 18 : aEntry.maData.index1 = ((sal_Int16 const * (*)()) func)();
177 18 : func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getIndex2" );
178 18 : aEntry.maData.index2 = ((sal_Int32 const * (*)()) func)();
179 18 : func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getLenArray" );
180 18 : aEntry.maData.lenArray = ((sal_Int32 const * (*)()) func)();
181 18 : func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getDataArea" );
182 18 : aEntry.maData.dataArea = ((sal_Unicode const * (*)()) func)();
183 : }
184 :
185 18 : data = aEntry.maData;
186 36 : aLoadedCache.push_back( aEntry );
187 : }
188 :
189 : #endif
190 :
191 4 : void xdictionary::setJapaneseWordBreak()
192 : {
193 4 : japaneseWordBreak = true;
194 4 : }
195 :
196 202 : bool xdictionary::exists(const sal_uInt32 c)
197 : {
198 : // 0x1FFF is the hardcoded limit in gendict for data.existMarks
199 202 : bool exist = (data.existMark && ((c>>3) < 0x1FFF)) ? sal::static_int_cast<sal_Bool>((data.existMark[c>>3] & (1<<(c&0x07))) != 0) : sal_False;
200 202 : if (!exist && japaneseWordBreak)
201 0 : return BreakIteratorImpl::getScriptClass(c) == ScriptType::ASIAN;
202 : else
203 202 : return exist;
204 : }
205 :
206 28 : sal_Int32 xdictionary::getLongestMatch(const sal_Unicode* str, sal_Int32 sLen)
207 : {
208 28 : if ( !data.index1 ) return 0;
209 :
210 28 : sal_Int16 idx = data.index1[str[0] >> 8];
211 :
212 28 : if (idx == 0xFF) return 0;
213 :
214 28 : idx = (idx<<8) | (str[0]&0xff);
215 :
216 28 : sal_uInt32 begin = data.index2[idx], end = data.index2[idx+1];
217 :
218 28 : if (begin == 0) return 0;
219 :
220 28 : str++; sLen--; // first character is not stored in the dictionary
221 4400 : for (sal_uInt32 i = end; i > begin; i--) {
222 4390 : sal_Int32 len = data.lenArray[i] - data.lenArray[i - 1];
223 4390 : if (sLen >= len) {
224 1768 : const sal_Unicode *dstr = data.dataArea + data.lenArray[i-1];
225 1768 : sal_Int32 pos = 0;
226 :
227 1768 : while (pos < len && dstr[pos] == str[pos]) { pos++; }
228 :
229 1768 : if (pos == len)
230 18 : return len + 1;
231 : }
232 : }
233 10 : return 0;
234 : }
235 :
236 :
237 : /*
238 : * c-tor
239 : */
240 :
241 3264 : WordBreakCache::WordBreakCache() :
242 : length( 0 ),
243 : contents( NULL ),
244 : wordboundary( NULL ),
245 3264 : size( 0 )
246 : {
247 3264 : }
248 :
249 : /*
250 : * Compare two unicode string,
251 : */
252 :
253 36 : bool WordBreakCache::equals(const sal_Unicode* str, Boundary& boundary)
254 : {
255 : // Different length, different string.
256 36 : if (length != boundary.endPos - boundary.startPos) return false;
257 :
258 122 : for (sal_Int32 i = 0; i < length; i++)
259 96 : if (contents[i] != str[i + boundary.startPos]) return false;
260 :
261 26 : return true;
262 : }
263 :
264 :
265 : /*
266 : * Retrieve the segment containing the character at pos.
267 : * @param pos : Position of the given character.
268 : * @return true if CJK.
269 : */
270 102 : bool xdictionary::seekSegment(const OUString &rText, sal_Int32 pos,
271 : Boundary& segBoundary)
272 : {
273 : sal_Int32 indexUtf16;
274 :
275 102 : if (segmentCachedString.pData != rText.pData) {
276 : // Cache the passed text so we can avoid regenerating the segment if it's the same
277 : // (pData is refcounted and assigning the OUString references it, which ensures that
278 : // the object is the same if we get the same pointer back later)
279 14 : segmentCachedString = rText;
280 : } else {
281 : // If pos is within the cached boundary, use that boundary
282 88 : if (pos >= segmentCachedBoundary.startPos && pos <= segmentCachedBoundary.endPos) {
283 32 : segBoundary.startPos = segmentCachedBoundary.startPos;
284 32 : segBoundary.endPos = segmentCachedBoundary.endPos;
285 32 : indexUtf16 = segmentCachedBoundary.startPos;
286 32 : rText.iterateCodePoints(&indexUtf16, 1);
287 32 : return segmentCachedBoundary.endPos > indexUtf16;
288 : }
289 : }
290 :
291 70 : segBoundary.endPos = segBoundary.startPos = pos;
292 :
293 70 : indexUtf16 = pos;
294 174 : while (indexUtf16 > 0)
295 : {
296 92 : sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, -1);
297 92 : if (u_isWhitespace(ch) || exists(ch))
298 34 : segBoundary.startPos = indexUtf16;
299 : else
300 58 : break;
301 : }
302 :
303 70 : indexUtf16 = pos;
304 200 : while (indexUtf16 < rText.getLength())
305 : {
306 122 : sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1);
307 122 : if (u_isWhitespace(ch) || exists(ch))
308 60 : segBoundary.endPos = indexUtf16;
309 : else
310 62 : break;
311 : }
312 :
313 : // Cache the calculated boundary
314 70 : segmentCachedBoundary.startPos = segBoundary.startPos;
315 70 : segmentCachedBoundary.endPos = segBoundary.endPos;
316 :
317 70 : indexUtf16 = segBoundary.startPos;
318 70 : rText.iterateCodePoints(&indexUtf16, 1);
319 70 : return segBoundary.endPos > indexUtf16;
320 : }
321 :
322 : #define KANJA 1
323 : #define KATAKANA 2
324 : #define HIRAKANA 3
325 :
326 0 : static sal_Int16 JapaneseCharType(sal_Unicode c)
327 : {
328 0 : if (0x3041 <= c && c <= 0x309e)
329 0 : return HIRAKANA;
330 0 : if ((0x30a1 <= c && c <= 0x30fe) || (0xff65 <= c && c <= 0xff9f))
331 0 : return KATAKANA;
332 0 : return KANJA;
333 : }
334 :
335 46 : WordBreakCache& xdictionary::getCache(const sal_Unicode *text, Boundary& wordBoundary)
336 : {
337 46 : WordBreakCache& rCache = cache[text[0] & 0x1f];
338 :
339 46 : if (rCache.size != 0 && rCache.equals(text, wordBoundary))
340 26 : return rCache;
341 :
342 20 : sal_Int32 len = wordBoundary.endPos - wordBoundary.startPos;
343 :
344 20 : if (rCache.size == 0 || len > rCache.size) {
345 10 : if (rCache.size != 0) {
346 0 : delete [] rCache.contents;
347 0 : delete [] rCache.wordboundary;
348 0 : rCache.size = len;
349 : }
350 : else
351 10 : rCache.size = len > DEFAULT_SIZE ? len : DEFAULT_SIZE;
352 10 : rCache.contents = new sal_Unicode[rCache.size + 1];
353 10 : rCache.wordboundary = new sal_Int32[rCache.size + 2];
354 : }
355 20 : rCache.length = len;
356 20 : memcpy(rCache.contents, text + wordBoundary.startPos, len * sizeof(sal_Unicode));
357 20 : *(rCache.contents + len) = 0x0000;
358 : // reset the wordboundary in cache
359 20 : memset(rCache.wordboundary, '\0', sizeof(sal_Int32)*(len + 2));
360 :
361 20 : sal_Int32 i = 0; // loop variable
362 74 : while (rCache.wordboundary[i] < rCache.length) {
363 34 : len = 0;
364 : // look the continuous white space as one word and cashe it
365 80 : while (u_isWhitespace((sal_uInt32)text[wordBoundary.startPos + rCache.wordboundary[i] + len]))
366 12 : len ++;
367 :
368 34 : if (len == 0) {
369 28 : const sal_Unicode *str = text + wordBoundary.startPos + rCache.wordboundary[i];
370 28 : sal_Int32 slen = rCache.length - rCache.wordboundary[i];
371 28 : sal_Int16 type = 0, count = 0;
372 56 : for (;len == 0 && slen > 0; str++, slen--) {
373 28 : len = getLongestMatch(str, slen);
374 28 : if (len == 0) {
375 10 : if (!japaneseWordBreak) {
376 10 : len = 1;
377 : } else {
378 0 : if (count == 0)
379 0 : type = JapaneseCharType(*str);
380 0 : else if (type != JapaneseCharType(*str))
381 0 : break;
382 0 : count++;
383 : }
384 : }
385 : }
386 28 : if (count)
387 : {
388 0 : rCache.wordboundary[i+1] = rCache.wordboundary[i] + count;
389 0 : i++;
390 : }
391 : }
392 :
393 34 : if (len) {
394 34 : rCache.wordboundary[i+1] = rCache.wordboundary[i] + len;
395 34 : i++;
396 : }
397 : }
398 20 : rCache.wordboundary[i + 1] = rCache.length + 1;
399 :
400 20 : return rCache;
401 : }
402 :
403 0 : Boundary xdictionary::previousWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
404 : {
405 : // looking for the first non-whitespace character from anyPos
406 0 : sal_uInt32 ch = 0;
407 0 : if (anyPos > 0)
408 0 : rText.iterateCodePoints(&anyPos, -1);
409 :
410 0 : while (anyPos > 0 && u_isWhitespace(ch)) ch = rText.iterateCodePoints(&anyPos, -1);
411 :
412 0 : return getWordBoundary(rText, anyPos, wordType, true);
413 : }
414 :
415 0 : Boundary xdictionary::nextWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
416 : {
417 0 : boundary = getWordBoundary(rText, anyPos, wordType, true);
418 0 : anyPos = boundary.endPos;
419 0 : const sal_Int32 nLen = rText.getLength();
420 0 : if (anyPos < nLen) {
421 : // looknig for the first non-whitespace character from anyPos
422 0 : sal_uInt32 ch = rText.iterateCodePoints(&anyPos, 1);
423 0 : while (u_isWhitespace(ch) && (anyPos < nLen)) ch=rText.iterateCodePoints(&anyPos, 1);
424 0 : if (anyPos > 0)
425 0 : rText.iterateCodePoints(&anyPos, -1);
426 : }
427 :
428 0 : return getWordBoundary(rText, anyPos, wordType, true);
429 : }
430 :
431 102 : Boundary xdictionary::getWordBoundary(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType, bool bDirection)
432 : {
433 102 : const sal_Unicode *text=rText.getStr();
434 102 : sal_Int32 len=rText.getLength();
435 102 : if (anyPos >= len || anyPos < 0) {
436 0 : boundary.startPos = boundary.endPos = anyPos < 0 ? 0 : len;
437 102 : } else if (seekSegment(rText, anyPos, boundary)) { // character in dict
438 46 : WordBreakCache& aCache = getCache(text, boundary);
439 46 : sal_Int32 i = 0;
440 :
441 46 : while (aCache.wordboundary[i] <= anyPos - boundary.startPos) i++;
442 :
443 46 : sal_Int32 startPos = aCache.wordboundary[i - 1];
444 : // if bDirection is false
445 46 : if (!bDirection && startPos > 0 && startPos == (anyPos - boundary.startPos))
446 : {
447 0 : sal_Int32 indexUtf16 = anyPos-1;
448 0 : sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1);
449 0 : if (u_isWhitespace(ch))
450 0 : i--;
451 : }
452 :
453 46 : boundary.endPos = boundary.startPos;
454 46 : boundary.endPos += aCache.wordboundary[i];
455 46 : boundary.startPos += aCache.wordboundary[i-1];
456 :
457 : } else {
458 56 : boundary.startPos = anyPos;
459 56 : if (anyPos < len) rText.iterateCodePoints(&anyPos, 1);
460 56 : boundary.endPos = anyPos < len ? anyPos : len;
461 : }
462 102 : if (wordType == WordType::WORD_COUNT) {
463 : // skip punctuation for word count.
464 182 : while (boundary.endPos < len)
465 : {
466 94 : sal_Int32 indexUtf16 = boundary.endPos;
467 94 : if (u_ispunct(rText.iterateCodePoints(&indexUtf16, 1)))
468 6 : boundary.endPos = indexUtf16;
469 : else
470 88 : break;
471 : }
472 : }
473 :
474 102 : return boundary;
475 : }
476 :
477 : } } } }
478 :
479 : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|