LCOV - code coverage report
Current view: top level - i18npool/source/breakiterator - xdictionary.cxx (source / functions) Hit Total Coverage
Test: commit c8344322a7af75b84dd3ca8f78b05543a976dfd5 Lines: 156 195 80.0 %
Date: 2015-06-13 12:38:46 Functions: 14 18 77.8 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
       2             : /*
       3             :  * This file is part of the LibreOffice project.
       4             :  *
       5             :  * This Source Code Form is subject to the terms of the Mozilla Public
       6             :  * License, v. 2.0. If a copy of the MPL was not distributed with this
       7             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       8             :  *
       9             :  * This file incorporates work covered by the following license notice:
      10             :  *
      11             :  *   Licensed to the Apache Software Foundation (ASF) under one or more
      12             :  *   contributor license agreements. See the NOTICE file distributed
      13             :  *   with this work for additional information regarding copyright
      14             :  *   ownership. The ASF licenses this file to you under the Apache
      15             :  *   License, Version 2.0 (the "License"); you may not use this file
      16             :  *   except in compliance with the License. You may obtain a copy of
      17             :  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
      18             :  */
      19             : 
      20             : #include <config_folders.h>
      21             : 
      22             : #include <osl/file.h>
      23             : #include <osl/mutex.hxx>
      24             : #include <rtl/ustrbuf.hxx>
      25             : #include <rtl/bootstrap.hxx>
      26             : #include <com/sun/star/i18n/WordType.hpp>
      27             : #include <xdictionary.hxx>
      28             : #include <unicode/uchar.h>
      29             : #include <string.h>
      30             : #include <breakiteratorImpl.hxx>
      31             : 
      32             : namespace com { namespace sun { namespace star { namespace i18n {
      33             : 
      34             : #ifdef DICT_JA_ZH_IN_DATAFILE
      35             : 
      36             : #elif !defined DISABLE_DYNLOADING
      37             : 
      38           0 : extern "C" { static void SAL_CALL thisModule() {} }
      39             : 
      40             : #else
      41             : 
      42             : extern "C" {
      43             : 
      44             : sal_uInt8* getExistMark_ja();
      45             : sal_Int16* getIndex1_ja();
      46             : sal_Int32* getIndex2_ja();
      47             : sal_Int32* getLenArray_ja();
      48             : sal_Unicode* getDataArea_ja();
      49             : 
      50             : sal_uInt8* getExistMark_zh();
      51             : sal_Int16* getIndex1_zh();
      52             : sal_Int32* getIndex2_zh();
      53             : sal_Int32* getLenArray_zh();
      54             : sal_Unicode* getDataArea_zh();
      55             : 
      56             : }
      57             : 
      58             : #endif
      59             : 
      60         603 : xdictionary::xdictionary(const sal_Char *lang) :
      61             :     boundary(),
      62         603 :     japaneseWordBreak( false )
      63             : {
      64             : 
      65             : #ifdef DICT_JA_ZH_IN_DATAFILE
      66             : 
      67             :     if( strcmp( lang, "ja" ) == 0 || strcmp( lang, "zh" ) == 0 )
      68             :     {
      69             :         OUString sUrl( "$BRAND_BASE_DIR/" LIBO_SHARE_FOLDER "/dict_" );
      70             :         rtl::Bootstrap::expandMacros(sUrl);
      71             : 
      72             :         if( strcmp( lang, "ja" ) == 0 )
      73             :             sUrl += "ja.data";
      74             :         else if( strcmp( lang, "zh" ) == 0 )
      75             :             sUrl += "zh.data";
      76             : 
      77             :         oslFileHandle aFileHandle;
      78             :         sal_uInt64 nFileSize;
      79             :         char *pMapping;
      80             :         if( osl_openFile( sUrl.pData, &aFileHandle, osl_File_OpenFlag_Read ) == osl_File_E_None &&
      81             :             osl_getFileSize( aFileHandle, &nFileSize) == osl_File_E_None &&
      82             :             osl_mapFile( aFileHandle, (void **) &pMapping, nFileSize, 0, osl_File_MapFlag_RandomAccess ) == osl_File_E_None )
      83             :         {
      84             :             // We have the offsets to the parts of the file at its end, see gendict.cxx
      85             :             sal_Int64 *pEOF = (sal_Int64*)(pMapping + nFileSize);
      86             : 
      87             :             data.existMark = (sal_uInt8*) (pMapping + pEOF[-1]);
      88             :             data.index2 = (sal_Int32*) (pMapping + pEOF[-2]);
      89             :             data.index1 = (sal_Int16*) (pMapping + pEOF[-3]);
      90             :             data.lenArray = (sal_Int32*) (pMapping + pEOF[-4]);
      91             :             data.dataArea = (sal_Unicode*) (pMapping + pEOF[-5]);
      92             :         }
      93             :     }
      94             : 
      95             : #elif !defined DISABLE_DYNLOADING
      96             : 
      97         603 :     initDictionaryData( lang );
      98             : 
      99             : #else
     100             : 
     101             :     if( strcmp( lang, "ja" ) == 0 ) {
     102             :         data.existMark = getExistMark_ja();
     103             :         data.index1 = getIndex1_ja();
     104             :         data.index2 = getIndex2_ja();
     105             :         data.lenArray = getLenArray_ja();
     106             :         data.dataArea = getDataArea_ja();
     107             :     }
     108             :     else if( strcmp( lang, "zh" ) == 0 ) {
     109             :         data.existMark = getExistMark_zh();
     110             :         data.index1 = getIndex1_zh();
     111             :         data.index2 = getIndex2_zh();
     112             :         data.lenArray = getLenArray_zh();
     113             :         data.dataArea = getDataArea_zh();
     114             :     }
     115             : 
     116             : #endif
     117             : 
     118       19899 :     for (sal_Int32 i = 0; i < CACHE_MAX; i++)
     119       19296 :         cache[i].size = 0;
     120             : 
     121         603 :     japaneseWordBreak = false;
     122         603 : }
     123             : 
     124        1206 : xdictionary::~xdictionary()
     125             : {
     126       19899 :     for (sal_Int32 i = 0; i < CACHE_MAX; i++) {
     127       19296 :         if (cache[i].size > 0) {
     128           5 :             delete [] cache[i].contents;
     129           5 :             delete [] cache[i].wordboundary;
     130             :         }
     131             :     }
     132         603 : }
     133             : 
     134             : namespace {
     135          30 :     struct datacache {
     136             :         oslModule       mhModule;
     137             :         OString         maLang;
     138             :         xdictionarydata maData;
     139             :     };
     140             : }
     141             : 
     142             : #if !defined(DICT_JA_ZH_IN_DATAFILE) && !defined(DISABLE_DYNLOADING)
     143             : 
     144         603 : void xdictionary::initDictionaryData(const sal_Char *pLang)
     145             : {
     146             :     // Global cache, never released for performance
     147         603 :     static std::vector< datacache > aLoadedCache;
     148             : 
     149         603 :     osl::MutexGuard aGuard( osl::Mutex::getGlobalMutex() );
     150         604 :     for( size_t i = 0; i < aLoadedCache.size(); ++i )
     151             :     {
     152         597 :         if( !strcmp( pLang, aLoadedCache[ i ].maLang.getStr() ) )
     153             :         {
     154         596 :             data = aLoadedCache[ i ].maData;
     155        1199 :             return;
     156             :         }
     157             :     }
     158             : 
     159             :     // otherwise add to the cache, positive or negative.
     160          14 :     datacache aEntry;
     161           7 :     aEntry.maLang = OString( pLang, strlen( pLang ) );
     162             : 
     163             : #ifdef SAL_DLLPREFIX
     164          14 :     OUStringBuffer aBuf( sal::static_int_cast<int>(strlen(pLang) + 7 + 6) );    // mostly "lib*.so" (with * == dict_zh)
     165           7 :     aBuf.appendAscii( SAL_DLLPREFIX );
     166             : #else
     167             :     OUStringBuffer aBuf( sal::static_int_cast<int>(strlen(pLang) + 7 + 4) );    // mostly "*.dll" (with * == dict_zh)
     168             : #endif
     169           7 :     aBuf.appendAscii( "dict_" ).appendAscii( pLang ).appendAscii( SAL_DLLEXTENSION );
     170           7 :     aEntry.mhModule = osl_loadModuleRelative( &thisModule, aBuf.makeStringAndClear().pData, SAL_LOADMODULE_DEFAULT );
     171           7 :     if( aEntry.mhModule ) {
     172             :         oslGenericFunction func;
     173           7 :         func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getExistMark" );
     174           7 :         aEntry.maData.existMark = reinterpret_cast<sal_uInt8 const * (*)()>(func)();
     175           7 :         func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getIndex1" );
     176           7 :         aEntry.maData.index1 = reinterpret_cast<sal_Int16 const * (*)()>(func)();
     177           7 :         func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getIndex2" );
     178           7 :         aEntry.maData.index2 = reinterpret_cast<sal_Int32 const * (*)()>(func)();
     179           7 :         func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getLenArray" );
     180           7 :         aEntry.maData.lenArray = reinterpret_cast<sal_Int32 const * (*)()>(func)();
     181           7 :         func = osl_getAsciiFunctionSymbol( aEntry.mhModule, "getDataArea" );
     182           7 :         aEntry.maData.dataArea = reinterpret_cast<sal_Unicode const * (*)()>(func)();
     183             :     }
     184             : 
     185           7 :     data = aEntry.maData;
     186          14 :     aLoadedCache.push_back( aEntry );
     187             : }
     188             : 
     189             : #endif
     190             : 
     191           2 : void xdictionary::setJapaneseWordBreak()
     192             : {
     193           2 :     japaneseWordBreak = true;
     194           2 : }
     195             : 
     196         101 : bool xdictionary::exists(const sal_uInt32 c)
     197             : {
     198             :     // 0x1FFF is the hardcoded limit in gendict for data.existMark
     199         101 :     bool exist = data.existMark && (c>>3) < 0x1FFF && (data.existMark[c>>3] & (1<<(c&0x07))) != 0;
     200         101 :     if (!exist && japaneseWordBreak)
     201           0 :         return BreakIteratorImpl::getScriptClass(c) == ScriptType::ASIAN;
     202             :     else
     203         101 :         return exist;
     204             : }
     205             : 
     206          14 : sal_Int32 xdictionary::getLongestMatch(const sal_Unicode* str, sal_Int32 sLen)
     207             : {
     208          14 :     if ( !data.index1 ) return 0;
     209             : 
     210          14 :     sal_Int16 idx = data.index1[str[0] >> 8];
     211             : 
     212          14 :     if (idx == 0xFF) return 0;
     213             : 
     214          14 :     idx = (idx<<8) | (str[0]&0xff);
     215             : 
     216          14 :     sal_uInt32 begin = data.index2[idx], end = data.index2[idx+1];
     217             : 
     218          14 :     if (begin == 0) return 0;
     219             : 
     220          14 :     str++; sLen--; // first character is not stored in the dictionary
     221        2200 :     for (sal_uInt32 i = end; i > begin; i--) {
     222        2195 :         sal_Int32 len = data.lenArray[i] - data.lenArray[i - 1];
     223        2195 :         if (sLen >= len) {
     224         884 :             const sal_Unicode *dstr = data.dataArea + data.lenArray[i-1];
     225         884 :             sal_Int32 pos = 0;
     226             : 
     227         884 :             while (pos < len && dstr[pos] == str[pos]) { pos++; }
     228             : 
     229         884 :             if (pos == len)
     230           9 :                 return len + 1;
     231             :         }
     232             :     }
     233           5 :     return 0;
     234             : }
     235             : 
     236             : 
     237             : /*
     238             :  * c-tor
     239             :  */
     240             : 
     241       19296 : WordBreakCache::WordBreakCache() :
     242             :     length( 0 ),
     243             :     contents( NULL ),
     244             :     wordboundary( NULL ),
     245       19296 :     size( 0 )
     246             : {
     247       19296 : }
     248             : 
     249             : /*
     250             :  * Compare two Unicode strings.
     251             :  */
     252             : 
     253          18 : bool WordBreakCache::equals(const sal_Unicode* str, Boundary& boundary)
     254             : {
     255             :     // Different length, different string.
     256          18 :     if (length != boundary.endPos - boundary.startPos) return false;
     257             : 
     258          61 :     for (sal_Int32 i = 0; i < length; i++)
     259          48 :         if (contents[i] != str[i + boundary.startPos]) return false;
     260             : 
     261          13 :     return true;
     262             : }
     263             : 
     264             : 
     265             : /*
     266             :  * Retrieve the segment containing the character at pos.
     267             :  * @param pos : Position of the given character.
     268             :  * @return true if CJK.
     269             :  */
     270          51 : bool xdictionary::seekSegment(const OUString &rText, sal_Int32 pos,
     271             :     Boundary& segBoundary)
     272             : {
     273             :     sal_Int32 indexUtf16;
     274             : 
     275          51 :     if (segmentCachedString.pData != rText.pData) {
     276             :         // Cache the passed text so we can avoid regenerating the segment if it's the same
     277             :         // (pData is refcounted and assigning the OUString references it, which ensures that
     278             :         // the object is the same if we get the same pointer back later)
     279           7 :         segmentCachedString = rText;
     280             :     } else {
     281             :         // If pos is within the cached boundary, use that boundary
     282          44 :         if (pos >= segmentCachedBoundary.startPos && pos <= segmentCachedBoundary.endPos) {
     283          16 :             segBoundary.startPos = segmentCachedBoundary.startPos;
     284          16 :             segBoundary.endPos = segmentCachedBoundary.endPos;
     285          16 :             indexUtf16 = segmentCachedBoundary.startPos;
     286          16 :             rText.iterateCodePoints(&indexUtf16, 1);
     287          16 :             return segmentCachedBoundary.endPos > indexUtf16;
     288             :         }
     289             :     }
     290             : 
     291          35 :     segBoundary.endPos = segBoundary.startPos = pos;
     292             : 
     293          35 :     indexUtf16 = pos;
     294          87 :     while (indexUtf16 > 0)
     295             :     {
     296          46 :         sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, -1);
     297          46 :         if (u_isWhitespace(ch) || exists(ch))
     298          17 :             segBoundary.startPos = indexUtf16;
     299             :         else
     300          29 :             break;
     301             :     }
     302             : 
     303          35 :     indexUtf16 = pos;
     304         100 :     while (indexUtf16 < rText.getLength())
     305             :     {
     306          61 :         sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1);
     307          61 :         if (u_isWhitespace(ch) || exists(ch))
     308          30 :             segBoundary.endPos = indexUtf16;
     309             :         else
     310          31 :             break;
     311             :     }
     312             : 
     313             :     // Cache the calculated boundary
     314          35 :     segmentCachedBoundary.startPos = segBoundary.startPos;
     315          35 :     segmentCachedBoundary.endPos = segBoundary.endPos;
     316             : 
     317          35 :     indexUtf16 = segBoundary.startPos;
     318          35 :     rText.iterateCodePoints(&indexUtf16, 1);
     319          35 :     return segBoundary.endPos > indexUtf16;
     320             : }
     321             : 
     322             : #define KANJA       1
     323             : #define KATAKANA    2
     324             : #define HIRAKANA    3
     325             : 
     326           0 : static sal_Int16 JapaneseCharType(sal_Unicode c)
     327             : {
     328           0 :     if (0x3041 <= c && c <= 0x309e)
     329           0 :         return HIRAKANA;
     330           0 :     if ((0x30a1 <= c && c <= 0x30fe) || (0xff65 <= c && c <= 0xff9f))
     331           0 :         return KATAKANA;
     332           0 :     return KANJA;
     333             : }
     334             : 
     335          23 : WordBreakCache& xdictionary::getCache(const sal_Unicode *text, Boundary& wordBoundary)
     336             : {
     337          23 :     WordBreakCache& rCache = cache[text[0] & 0x1f];
     338             : 
     339          23 :     if (rCache.size != 0 && rCache.equals(text, wordBoundary))
     340          13 :         return rCache;
     341             : 
     342          10 :     sal_Int32 len = wordBoundary.endPos - wordBoundary.startPos;
     343             : 
     344          10 :     if (rCache.size == 0 || len > rCache.size) {
     345           5 :         if (rCache.size != 0) {
     346           0 :             delete [] rCache.contents;
     347           0 :             delete [] rCache.wordboundary;
     348           0 :             rCache.size = len;
     349             :         }
     350             :         else
     351           5 :             rCache.size = len > DEFAULT_SIZE ? len : DEFAULT_SIZE;
     352           5 :         rCache.contents = new sal_Unicode[rCache.size + 1];
     353           5 :         rCache.wordboundary = new sal_Int32[rCache.size + 2];
     354             :     }
     355          10 :     rCache.length  = len;
     356          10 :     memcpy(rCache.contents, text + wordBoundary.startPos, len * sizeof(sal_Unicode));
     357          10 :     *(rCache.contents + len) = 0x0000;
     358             :     // reset the wordboundary in cache
     359          10 :     memset(rCache.wordboundary, '\0', sizeof(sal_Int32)*(len + 2));
     360             : 
     361          10 :     sal_Int32 i = 0;        // loop variable
     362          37 :     while (rCache.wordboundary[i] < rCache.length) {
     363          17 :         len = 0;
     364             :         // treat a run of continuous white space as one word and cache it
     365          40 :         while (u_isWhitespace((sal_uInt32)text[wordBoundary.startPos + rCache.wordboundary[i] + len]))
     366           6 :             len ++;
     367             : 
     368          17 :         if (len == 0) {
     369          14 :             const sal_Unicode *str = text + wordBoundary.startPos + rCache.wordboundary[i];
     370          14 :             sal_Int32 slen = rCache.length - rCache.wordboundary[i];
     371          14 :             sal_Int16 type = 0, count = 0;
     372          28 :             for (;len == 0 && slen > 0; str++, slen--) {
     373          14 :                 len = getLongestMatch(str, slen);
     374          14 :                 if (len == 0) {
     375           5 :                     if (!japaneseWordBreak) {
     376           5 :                         len = 1;
     377             :                     } else {
     378           0 :                         if (count == 0)
     379           0 :                             type = JapaneseCharType(*str);
     380           0 :                         else if (type != JapaneseCharType(*str))
     381           0 :                             break;
     382           0 :                         count++;
     383             :                     }
     384             :                 }
     385             :             }
     386          14 :             if (count)
     387             :             {
     388           0 :                 rCache.wordboundary[i+1] = rCache.wordboundary[i] + count;
     389           0 :                 i++;
     390             :             }
     391             :         }
     392             : 
     393          17 :         if (len) {
     394          17 :             rCache.wordboundary[i+1] = rCache.wordboundary[i] + len;
     395          17 :             i++;
     396             :         }
     397             :     }
     398          10 :     rCache.wordboundary[i + 1] = rCache.length + 1;
     399             : 
     400          10 :     return rCache;
     401             : }
     402             : 
     403           0 : Boundary xdictionary::previousWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
     404             : {
     405             :         // looking for the first non-whitespace character from anyPos
     406           0 :         sal_uInt32 ch = 0;
     407           0 :         if (anyPos > 0)
     408           0 :             rText.iterateCodePoints(&anyPos, -1);
     409             : 
     410           0 :         while (anyPos > 0 && u_isWhitespace(ch)) ch = rText.iterateCodePoints(&anyPos, -1);
     411             : 
     412           0 :         return getWordBoundary(rText, anyPos, wordType, true);
     413             : }
     414             : 
     415           0 : Boundary xdictionary::nextWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
     416             : {
     417           0 :         boundary = getWordBoundary(rText, anyPos, wordType, true);
     418           0 :         anyPos = boundary.endPos;
     419           0 :         const sal_Int32 nLen = rText.getLength();
     420           0 :         if (anyPos < nLen) {
     421             :             // looking for the first non-whitespace character from anyPos
     422           0 :             sal_uInt32 ch = rText.iterateCodePoints(&anyPos, 1);
     423           0 :             while (u_isWhitespace(ch) && (anyPos < nLen)) ch=rText.iterateCodePoints(&anyPos, 1);
     424           0 :             if (anyPos > 0)
     425           0 :                 rText.iterateCodePoints(&anyPos, -1);
     426             :         }
     427             : 
     428           0 :         return getWordBoundary(rText, anyPos, wordType, true);
     429             : }
     430             : 
     431          51 : Boundary xdictionary::getWordBoundary(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType, bool bDirection)
     432             : {
     433          51 :         const sal_Unicode *text=rText.getStr();
     434          51 :         sal_Int32 len=rText.getLength();
     435          51 :         if (anyPos >= len || anyPos < 0) {
     436           0 :             boundary.startPos = boundary.endPos = anyPos < 0 ? 0 : len;
     437          51 :         } else if (seekSegment(rText, anyPos, boundary)) {          // character in dict
     438          23 :             WordBreakCache& aCache = getCache(text, boundary);
     439          23 :             sal_Int32 i = 0;
     440             : 
     441          23 :             while (aCache.wordboundary[i] <= anyPos - boundary.startPos) i++;
     442             : 
     443          23 :             sal_Int32 startPos = aCache.wordboundary[i - 1];
     444             :             // if bDirection is false
     445          23 :             if (!bDirection && startPos > 0 && startPos == (anyPos - boundary.startPos))
     446             :             {
     447           0 :                 sal_Int32 indexUtf16 = anyPos-1;
     448           0 :                 sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1);
     449           0 :                 if (u_isWhitespace(ch))
     450           0 :                     i--;
     451             :             }
     452             : 
     453          23 :             boundary.endPos = boundary.startPos;
     454          23 :             boundary.endPos += aCache.wordboundary[i];
     455          23 :             boundary.startPos += aCache.wordboundary[i-1];
     456             : 
     457             :         } else {
     458          28 :             boundary.startPos = anyPos;
     459          28 :             if (anyPos < len) rText.iterateCodePoints(&anyPos, 1);
     460          28 :             boundary.endPos = anyPos < len ? anyPos : len;
     461             :         }
     462          51 :         if (wordType == WordType::WORD_COUNT) {
     463             :             // skip punctuation for word count.
     464          91 :             while (boundary.endPos < len)
     465             :             {
     466          47 :                 sal_Int32 indexUtf16 = boundary.endPos;
     467          47 :                 if (u_ispunct(rText.iterateCodePoints(&indexUtf16, 1)))
     468           3 :                     boundary.endPos = indexUtf16;
     469             :                 else
     470          44 :                     break;
     471             :             }
     472             :         }
     473             : 
     474          51 :         return boundary;
     475             : }
     476             : 
     477             : } } } }
     478             : 
     479             : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */

Generated by: LCOV version 1.11