LCOV - code coverage report
Current view: top level - libreoffice/i18npool/source/breakiterator - xdictionary.cxx (source / functions) Hit Total Coverage
Test: libreoffice_filtered.info Lines: 134 174 77.0 %
Date: 2012-12-27 Functions: 10 14 71.4 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
       2             : /*
       3             :  * This file is part of the LibreOffice project.
       4             :  *
       5             :  * This Source Code Form is subject to the terms of the Mozilla Public
       6             :  * License, v. 2.0. If a copy of the MPL was not distributed with this
       7             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       8             :  *
       9             :  * This file incorporates work covered by the following license notice:
      10             :  *
      11             :  *   Licensed to the Apache Software Foundation (ASF) under one or more
      12             :  *   contributor license agreements. See the NOTICE file distributed
      13             :  *   with this work for additional information regarding copyright
      14             :  *   ownership. The ASF licenses this file to you under the Apache
      15             :  *   License, Version 2.0 (the "License"); you may not use this file
      16             :  *   except in compliance with the License. You may obtain a copy of
      17             :  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
      18             :  */
      19             : 
      20             : 
      21             : // xdictionary.cpp: implementation of the xdictionary class.
      22             : //
      23             : //////////////////////////////////////////////////////////////////////
      24             : 
      25             : 
      26             : #include <rtl/ustrbuf.hxx>
      27             : 
      28             : #include <com/sun/star/i18n/WordType.hpp>
      29             : #include <xdictionary.hxx>
      30             : #include <unicode/uchar.h>
      31             : #include <string.h>
      32             : #include <breakiteratorImpl.hxx>
      33             : 
      34             : //////////////////////////////////////////////////////////////////////
      35             : // Construction/Destruction
      36             : //////////////////////////////////////////////////////////////////////
      37             : 
      38             : using ::rtl::OUString;
      39             : using ::rtl::OUStringBuffer;
      40             : 
      41             : namespace com { namespace sun { namespace star { namespace i18n {
      42             : 
      43             : #ifndef DISABLE_DYNLOADING
      44             : 
      45           0 : extern "C" { static void SAL_CALL thisModule() {} }
      46             : 
      47             : #else
      48             : 
      49             : extern "C" {
      50             : 
      51             : sal_uInt8* getExistMark_ja();
      52             : sal_Int16* getIndex1_ja();
      53             : sal_Int32* getIndex2_ja();
      54             : sal_Int32* getLenArray_ja();
      55             : sal_Unicode* getDataArea_ja();
      56             : 
      57             : sal_uInt8* getExistMark_zh();
      58             : sal_Int16* getIndex1_zh();
      59             : sal_Int32* getIndex2_zh();
      60             : sal_Int32* getLenArray_zh();
      61             : sal_Unicode* getDataArea_zh();
      62             : 
      63             : }
      64             : 
      65             : #endif
      66             : 
      67           3 : xdictionary::xdictionary(const sal_Char *lang) :
      68             :     existMark( NULL ),
      69             :     index1( NULL ),
      70             :     index2( NULL ),
      71             :     lenArray( NULL ),
      72             :     dataArea( NULL ),
      73             : #ifndef DISABLE_DYNLOADING
      74             :     hModule( NULL ),
      75             : #endif
      76             :     boundary(),
      77           3 :     japaneseWordBreak( sal_False )
      78             : {
      79           3 :     index1 = 0;
      80             : #ifndef DISABLE_DYNLOADING
      81             : #ifdef SAL_DLLPREFIX
      82           3 :     OUStringBuffer aBuf( strlen(lang) + 7 + 6 );    // mostly "lib*.so" (with * == dict_zh)
      83           3 :     aBuf.appendAscii( SAL_DLLPREFIX );
      84             : #else
      85             :     OUStringBuffer aBuf( strlen(lang) + 7 + 4 );    // mostly "*.dll" (with * == dict_zh)
      86             : #endif
      87           3 :     aBuf.appendAscii( "dict_" ).appendAscii( lang ).appendAscii( SAL_DLLEXTENSION );
      88           3 :         hModule = osl_loadModuleRelative( &thisModule, aBuf.makeStringAndClear().pData, SAL_LOADMODULE_DEFAULT );
      89           3 :         if( hModule ) {
      90             :             sal_IntPtr (*func)();
      91           3 :             func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString("getExistMark").pData );
      92           3 :             existMark = (sal_uInt8*) (*func)();
      93           3 :             func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString("getIndex1").pData );
      94           3 :             index1 = (sal_Int16*) (*func)();
      95           3 :             func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString("getIndex2").pData );
      96           3 :             index2 = (sal_Int32*) (*func)();
      97           3 :             func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString("getLenArray").pData );
      98           3 :             lenArray = (sal_Int32*) (*func)();
      99           3 :             func = (sal_IntPtr(*)()) osl_getFunctionSymbol( hModule, OUString("getDataArea").pData );
     100           3 :             dataArea = (sal_Unicode*) (*func)();
     101             :         }
     102             :         else
     103             :         {
     104           0 :             existMark = NULL;
     105           0 :             index1 = NULL;
     106           0 :             index2 = NULL;
     107           0 :             lenArray = NULL;
     108           0 :             dataArea = NULL;
     109             :         }
     110             : 
     111             : #else
     112             :         if( strcmp( lang, "ja" ) == 0 ) {
     113             :             existMark = getExistMark_ja();
     114             :             index1 = getIndex1_ja();
     115             :             index2 = getIndex2_ja();
     116             :             lenArray = getLenArray_ja();
     117             :             dataArea = getDataArea_ja();
     118             :         }
     119             :         else if( strcmp( lang, "zh" ) == 0 ) {
     120             :             existMark = getExistMark_zh();
     121             :             index1 = getIndex1_zh();
     122             :             index2 = getIndex2_zh();
     123             :             lenArray = getLenArray_zh();
     124             :             dataArea = getDataArea_zh();
     125             :         }
     126             :         else
     127             :         {
     128             :             existMark = NULL;
     129             :             index1 = NULL;
     130             :             index2 = NULL;
     131             :             lenArray = NULL;
     132             :             dataArea = NULL;
     133             :         }
     134             : #endif
     135             : 
     136          99 :         for (sal_Int32 i = 0; i < CACHE_MAX; i++)
     137          96 :             cache[i].size = 0;
     138             : 
     139           3 :         japaneseWordBreak = sal_False;
     140           3 : }
     141             : 
     142           3 : xdictionary::~xdictionary() {
     143             : #ifndef DISABLE_DYNLOADING
     144           3 :         osl_unloadModule(hModule);
     145             : #endif
     146          99 :         for (sal_Int32 i = 0; i < CACHE_MAX; i++) {
     147          96 :             if (cache[i].size > 0) {
     148           4 :                 delete [] cache[i].contents;
     149           4 :                 delete [] cache[i].wordboundary;
     150             :             }
     151             :         }
     152           3 : }
     153             : 
     154           1 : void xdictionary::setJapaneseWordBreak()
     155             : {
     156           1 :         japaneseWordBreak = sal_True;
     157           1 : }
     158             : 
     159         175 : sal_Bool xdictionary::exists(const sal_uInt32 c) {
     160             :         // 0x1FFF is the hardcoded limit in gendict for existMarks
     161         175 :         sal_Bool exist = (existMark && ((c>>3) < 0x1FFF)) ? sal::static_int_cast<sal_Bool>((existMark[c>>3] & (1<<(c&0x07))) != 0) : sal_False;
     162         175 :         if (!exist && japaneseWordBreak)
     163           0 :             return BreakIteratorImpl::getScriptClass(c) == ScriptType::ASIAN;
     164             :         else
     165         175 :             return exist;
     166             : }
     167             : 
     168          12 : sal_Int32 xdictionary::getLongestMatch(const sal_Unicode* str, sal_Int32 sLen) {
     169             : 
     170          12 :         if ( !index1 ) return 0;
     171             : 
     172          12 :         sal_Int16 idx = index1[str[0] >> 8];
     173             : 
     174          12 :         if (idx == 0xFF) return 0;
     175             : 
     176          12 :         idx = (idx<<8) | (str[0]&0xff);
     177             : 
     178          12 :         sal_uInt32 begin = index2[idx], end = index2[idx+1];
     179             : 
     180          12 :         if (begin == 0) return 0;
     181             : 
     182          12 :         str++; sLen--; // first character is not stored in the dictionary
     183        2115 :         for (sal_uInt32 i = end; i > begin; i--) {
     184        2110 :             sal_Int32 len = lenArray[i] - lenArray[i - 1];
     185        2110 :             if (sLen >= len) {
     186         718 :                 const sal_Unicode *dstr = dataArea + lenArray[i-1];
     187         718 :                 sal_Int32 pos = 0;
     188             : 
     189         718 :                 while (pos < len && dstr[pos] == str[pos]) { pos++; }
     190             : 
     191         718 :                 if (pos == len)
     192           7 :                     return len + 1;
     193             :             }
     194             :         }
     195           5 :         return 0;
     196             : }
     197             : 
     198             : 
     199             : /*
     200             :  * c-tor
     201             :  */
     202             : 
     203          96 : WordBreakCache::WordBreakCache() :
     204             :     length( 0 ),
     205             :     contents( NULL ),
     206             :     wordboundary( NULL ),
     207          96 :     size( 0 )
     208             : {
     209          96 : }
     210             : 
     211             : /*
     212             :  * Compare two unicode string,
     213             :  */
     214             : 
     215          21 : sal_Bool WordBreakCache::equals(const sal_Unicode* str, Boundary& boundary) {
     216             :         // Different length, different string.
     217          21 :         if (length != boundary.endPos - boundary.startPos) return sal_False;
     218             : 
     219          66 :         for (sal_Int32 i = 0; i < length; i++)
     220          50 :             if (contents[i] != str[i + boundary.startPos]) return sal_False;
     221             : 
     222          16 :         return sal_True;
     223             : }
     224             : 
     225             : 
     226             : /*
     227             :  * Retrieve the segment containing the character at pos.
     228             :  * @param pos : Position of the given character.
     229             :  * @return true if CJK.
     230             :  */
     231          56 : sal_Bool xdictionary::seekSegment(const rtl::OUString &rText, sal_Int32 pos,
     232             :     Boundary& segBoundary)
     233             : {
     234             :     sal_Int32 indexUtf16;
     235          56 :     segBoundary.endPos = segBoundary.startPos = pos;
     236             : 
     237          56 :     indexUtf16 = pos;
     238         158 :     while (indexUtf16 > 0)
     239             :     {
     240          95 :         sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, -1);
     241          95 :         if (u_isWhitespace(ch) || exists(ch))
     242          46 :             segBoundary.startPos = indexUtf16;
     243             :         else
     244          49 :             break;
     245             :     }
     246             : 
     247          56 :     indexUtf16 = pos;
     248         153 :     while (indexUtf16 < rText.getLength())
     249             :     {
     250          94 :         sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1);
     251          94 :         if (u_isWhitespace(ch) || exists(ch))
     252          41 :             segBoundary.endPos = indexUtf16;
     253             :         else
     254          53 :             break;
     255             :     }
     256             : 
     257          56 :     indexUtf16 = segBoundary.startPos;
     258          56 :     rText.iterateCodePoints(&indexUtf16, 1);
     259          56 :     return segBoundary.endPos > indexUtf16;
     260             : }
     261             : 
     262             : #define KANJA       1
     263             : #define KATAKANA    2
     264             : #define HIRAKANA    3
     265             : 
     266           0 : static sal_Int16 JapaneseCharType(sal_Unicode c)
     267             : {
     268           0 :     if (0x3041 <= c && c <= 0x309e)
     269           0 :         return HIRAKANA;
     270           0 :     if ((0x30a1 <= c && c <= 0x30fe) || (0xff65 <= c && c <= 0xff9f))
     271           0 :         return KATAKANA;
     272           0 :     return KANJA;
     273             : }
     274             : 
     275          25 : WordBreakCache& xdictionary::getCache(const sal_Unicode *text, Boundary& wordBoundary)
     276             : {
     277          25 :     WordBreakCache& rCache = cache[text[0] & 0x1f];
     278             : 
     279          25 :     if (rCache.size != 0 && rCache.equals(text, wordBoundary))
     280          16 :         return rCache;
     281             : 
     282           9 :     sal_Int32 len = wordBoundary.endPos - wordBoundary.startPos;
     283             : 
     284           9 :     if (rCache.size == 0 || len > rCache.size) {
     285           4 :         if (rCache.size != 0) {
     286           0 :             delete rCache.contents;
     287           0 :             delete rCache.wordboundary;
     288           0 :             rCache.size = len;
     289             :         }
     290             :         else
     291           4 :             rCache.size = len > DEFAULT_SIZE ? len : DEFAULT_SIZE;
     292           4 :         rCache.contents = new sal_Unicode[rCache.size + 1];
     293           4 :         rCache.wordboundary = new sal_Int32[rCache.size + 2];
     294             :     }
     295           9 :     rCache.length  = len;
     296           9 :     memcpy(rCache.contents, text + wordBoundary.startPos, len * sizeof(sal_Unicode));
     297           9 :     *(rCache.contents + len) = 0x0000;
     298             :     // reset the wordboundary in cache
     299           9 :     memset(rCache.wordboundary, '\0', sizeof(sal_Int32)*(len + 2));
     300             : 
     301           9 :     sal_Int32 i = 0;        // loop variable
     302          33 :     while (rCache.wordboundary[i] < rCache.length) {
     303          15 :         len = 0;
     304             :         // look the continuous white space as one word and cashe it
     305          36 :         while (u_isWhitespace((sal_uInt32)text[wordBoundary.startPos + rCache.wordboundary[i] + len]))
     306           6 :             len ++;
     307             : 
     308          15 :         if (len == 0) {
     309          12 :             const sal_Unicode *str = text + wordBoundary.startPos + rCache.wordboundary[i];
     310          12 :             sal_Int32 slen = rCache.length - rCache.wordboundary[i];
     311          12 :             sal_Int16 type = 0, count = 0;
     312          24 :             for (;len == 0 && slen > 0; str++, slen--) {
     313          12 :                 len = getLongestMatch(str, slen);
     314          12 :                 if (len == 0) {
     315           5 :                     if (!japaneseWordBreak) {
     316           5 :                         len = 1;
     317             :                     } else {
     318           0 :                         if (count == 0)
     319           0 :                             type = JapaneseCharType(*str);
     320           0 :                         else if (type != JapaneseCharType(*str))
     321           0 :                             break;
     322           0 :                         count++;
     323             :                     }
     324             :                 }
     325             :             }
     326          12 :             if (count)
     327             :             {
     328           0 :                 rCache.wordboundary[i+1] = rCache.wordboundary[i] + count;
     329           0 :                 i++;
     330             :             }
     331             :         }
     332             : 
     333          15 :         if (len) {
     334          15 :             rCache.wordboundary[i+1] = rCache.wordboundary[i] + len;
     335          15 :             i++;
     336             :         }
     337             :     }
     338           9 :     rCache.wordboundary[i + 1] = rCache.length + 1;
     339             : 
     340           9 :     return rCache;
     341             : }
     342             : 
     343           0 : Boundary xdictionary::previousWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
     344             : {
     345             :         // looking for the first non-whitespace character from anyPos
     346           0 :         sal_uInt32 ch = rText.iterateCodePoints(&anyPos, -1);
     347             : 
     348           0 :         while (anyPos > 0 && u_isWhitespace(ch)) ch = rText.iterateCodePoints(&anyPos, -1);
     349             : 
     350           0 :         return getWordBoundary(rText, anyPos, wordType, true);
     351             : }
     352             : 
     353           0 : Boundary xdictionary::nextWord(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType)
     354             : {
     355           0 :         boundary = getWordBoundary(rText, anyPos, wordType, true);
     356           0 :         anyPos = boundary.endPos;
     357           0 :         if (anyPos < rText.getLength()) {
     358             :             // looknig for the first non-whitespace character from anyPos
     359           0 :             sal_uInt32 ch = rText.iterateCodePoints(&anyPos, 1);
     360           0 :             while (u_isWhitespace(ch)) ch=rText.iterateCodePoints(&anyPos, 1);
     361           0 :             rText.iterateCodePoints(&anyPos, -1);
     362             :         }
     363             : 
     364           0 :         return getWordBoundary(rText, anyPos, wordType, true);
     365             : }
     366             : 
     367          56 : Boundary xdictionary::getWordBoundary(const OUString& rText, sal_Int32 anyPos, sal_Int16 wordType, sal_Bool bDirection)
     368             : {
     369          56 :         const sal_Unicode *text=rText.getStr();
     370          56 :         sal_Int32 len=rText.getLength();
     371          56 :         if (anyPos >= len || anyPos < 0) {
     372           0 :             boundary.startPos = boundary.endPos = anyPos < 0 ? 0 : len;
     373          56 :         } else if (seekSegment(rText, anyPos, boundary)) {          // character in dict
     374          25 :             WordBreakCache& aCache = getCache(text, boundary);
     375          25 :             sal_Int32 i = 0;
     376             : 
     377          25 :             while (aCache.wordboundary[i] <= anyPos - boundary.startPos) i++;
     378             : 
     379          25 :             sal_Int32 startPos = aCache.wordboundary[i - 1];
     380             :             // if bDirection is false
     381          25 :             if (!bDirection && startPos > 0 && startPos == (anyPos - boundary.startPos))
     382             :             {
     383           0 :                 sal_Int32 indexUtf16 = anyPos-1;
     384           0 :                 sal_uInt32 ch = rText.iterateCodePoints(&indexUtf16, 1);
     385           0 :                 if (u_isWhitespace(ch))
     386           0 :                     i--;
     387             :             }
     388          25 :             boundary.endPos = boundary.startPos;
     389          25 :             rText.iterateCodePoints(&boundary.endPos, aCache.wordboundary[i]);
     390          25 :             rText.iterateCodePoints(&boundary.startPos, aCache.wordboundary[i-1]);
     391             :         } else {
     392          31 :             boundary.startPos = anyPos;
     393          31 :             if (anyPos < len) rText.iterateCodePoints(&anyPos, 1);
     394          31 :             boundary.endPos = anyPos < len ? anyPos : len;
     395             :         }
     396          56 :         if (wordType == WordType::WORD_COUNT) {
     397             :             // skip punctuation for word count.
     398         101 :             while (boundary.endPos < len)
     399             :             {
     400          51 :                 sal_Int32 indexUtf16 = boundary.endPos;
     401          51 :                 if (u_ispunct(rText.iterateCodePoints(&indexUtf16, 1)))
     402           3 :                     boundary.endPos = indexUtf16;
     403             :                 else
     404             :                     break;
     405             :             }
     406             :         }
     407             : 
     408          56 :         return boundary;
     409             : }
     410             : 
     411             : } } } }
     412             : 
     413             : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */

Generated by: LCOV version 1.10