LCOV - code coverage report
Current view: top level - i18npool/source/search - levdis.cxx (source / functions) Hit Total Coverage
Test: commit 10e77ab3ff6f4314137acd6e2702a6e5c1ce1fae Lines: 0 183 0.0 %
Date: 2014-11-03 Functions: 0 12 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
       2             : /*
       3             :  * This file is part of the LibreOffice project.
       4             :  *
       5             :  * This Source Code Form is subject to the terms of the Mozilla Public
       6             :  * License, v. 2.0. If a copy of the MPL was not distributed with this
       7             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       8             :  *
       9             :  * This file incorporates work covered by the following license notice:
      10             :  *
      11             :  *   Licensed to the Apache Software Foundation (ASF) under one or more
      12             :  *   contributor license agreements. See the NOTICE file distributed
      13             :  *   with this work for additional information regarding copyright
      14             :  *   ownership. The ASF licenses this file to you under the Apache
      15             :  *   License, Version 2.0 (the "License"); you may not use this file
      16             :  *   except in compliance with the License. You may obtain a copy of
      17             :  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
      18             :  */
      19             : 
      20             : /*
      21             : 
      22             :     Weighted Levenshtein Distance
      23             :     including wildcards
      24             :     '*' for any number (0 or more) of arbitrary characters
      25             :     '?' for exactly one arbitrary character
      26             :     escapeable with  backslash, "\*" or "\?"
      27             : 
      28             :     Return:
      29             :         WLD if WLD <= nLimit, else nLimit+1
      30             : 
      31             :     or, if bSplitCount:
      32             :         WLD if WLD <= nLimit
      33             :         -WLD if Replace and Insert and Delete <= nLimit
      34             :         else nLimit+1
      35             : 
      36             :     Recursive definition of WLD:
      37             : 
      38             :     WLD( X(i), Y(j) ) = min( WLD( X(i-1), Y(j-1) ) + p(i,j) ,
      39             :                              WLD( X(i)  , Y(j-1) ) + q      ,
      40             :                              WLD( X(i-1), Y(j)   ) + r      )
      41             : 
      42             :     X(i)   := the first i characters of the word X
      43             :     Y(j)   := the first j characters of the word Y
      44             :     p(i,j) := 0 if i-th character of X == j-th character of Y,
      45             :               p else
      46             : 
      47             :     Boundary conditions:
      48             :     WLD( X(0), Y(j) ) := j*q  (Y created by j inserts)
      49             :     WLD( X(i), Y(0) ) := i*r  (Y created by i deletes)
      50             :     WLD( X(0), Y(0) ) := 0
      51             : 
      52             :     Instead of recursions a dynamic algorithm is used.
      53             : 
      54             :     See also: German computer magazine
      55             :     c't 07/89 pages 192-208 and c't 03/94 pages 230-239
      56             : */
      57             : 
      58             : #include <string.h>
      59             : 
      60             : #if defined( _MSC_VER )
      61             : #pragma warning(once: 4068)
      62             : #endif
      63             : 
      64             : #include "levdis.hxx"
      65             : 
      66             : #ifdef SOLARIS
      67             : #undef min
      68             : #endif
      69             : 
      70             : #define LEVDISBIG   (nLimit + 1)    // Return value if distance > nLimit
      71             : #define LEVDISDOUBLEBUF 2048        // no doubling atop this border
      72             : 
      73           0 : static sal_Int32 Impl_WLD_StringLen( const sal_Unicode* pStr )
      74             : {
      75           0 :     const sal_Unicode* pTempStr = pStr;
      76           0 :     while( *pTempStr )
      77           0 :         pTempStr++;
      78           0 :     return (sal_Int32)(pTempStr-pStr);
      79             : }
      80             : 
      81             : // Distance from string to pattern
      82           0 : int WLevDistance::WLD( const sal_Unicode* cString, sal_Int32 nStringLen )
      83             : {
      84           0 :     int nSPMin = 0;     // penalty point Minimum
      85           0 :     int nRepS = 0;      // for SplitCount
      86             : 
      87             :     // length difference between pattern and string
      88           0 :     int nLenDiff = nPatternLen - nStars - nStringLen;
      89             :     // more insertions or deletions necessary as the limit? Then leave
      90           0 :     if ( (nLenDiff * nInsQ0 > nLimit)
      91           0 :             || ((nStars == 0) && (nLenDiff * nDelR0 < -nLimit)) )
      92           0 :         return(LEVDISBIG);
      93             : 
      94             :      // comparative String greater than  instantaneous array
      95             :     // -> adapt array size
      96           0 :     if ( nStringLen >= nArrayLen )
      97             :     {
      98             :         // increase size much more to avoid reallocation
      99           0 :         if ( nStringLen < LEVDISDOUBLEBUF )
     100           0 :             nArrayLen = 2 * nStringLen;
     101             :         else
     102           0 :             nArrayLen = nStringLen + 1;
     103           0 :         npDistance = aDisMem.NewMem( nArrayLen );
     104             :     }
     105             : 
     106             :     // calculate start values of the second column(first Pattern-value)
     107             :     // first column (0-Len Pattern) is always zero .. nStringLen * nInsQ0,
     108             :     // therefore the minimum is 0
     109           0 :     if ( nPatternLen == 0 )
     110             :     {
     111             :         // Count of deletions, to determine the Pattern
     112           0 :         for ( sal_Int32 i=0; i <= nStringLen; i++ )
     113           0 :             npDistance[i] = i * nDelR0;
     114             :     }
     115           0 :     else if ( cpPattern[0] == '*' && bpPatIsWild[0] )
     116             :     {
     117             :         // instead of a  '*' you can fit in anything
     118           0 :         for ( sal_Int32 i=0; i <= nStringLen; i++ )
     119           0 :             npDistance[i] = 0;
     120             :     }
     121             :     else
     122             :     {
     123             :         sal_Unicode c;
     124             :         int nP;
     125           0 :         c = cpPattern[0];
     126           0 :         if ( c == '?' && bpPatIsWild[0] )
     127           0 :             nP = 0;     // a '?' could be any character.
     128             :         else
     129             :             // Minimum replace and delete +insert  weighting
     130           0 :             nP = Min3( nRepP0, nRepP0, nDelR0 + nInsQ0 );
     131           0 :         npDistance[0] = nInsQ0;     // start with simple insert
     132           0 :         npDistance[1] = nInsQ0;
     133           0 :         npDistance[2] = nInsQ0;
     134           0 :         int nReplacePos = -1;       // tristate flag
     135           0 :         int nDelCnt = 0;
     136           0 :         for ( sal_Int32 i=1; i <= nStringLen; i++, nDelCnt += nDelR0 )
     137             :         {
     138           0 :             if ( cString[i-1] == c )
     139           0 :                 nP = 0;     // Replace from this postion with 0
     140             :             // Deletion to determine the Pattern + Replace
     141           0 :             npDistance[i] = nDelCnt + nP;
     142           0 :             if ( bSplitCount )
     143             :             {
     144           0 :                 if ( nReplacePos < 0 && nP )
     145             :                 {   // this Postion will be replaced
     146           0 :                     nRepS++;
     147           0 :                     nReplacePos = i;
     148             :                 }
     149           0 :                 else if ( nReplacePos > 0 && !nP )
     150             :                 {
     151             :                    // same count  c
     152           0 :                     int nBalance = levdisbalance( 0, i-1, c, cString, nStringLen );
     153           0 :                     if ( !nBalance )
     154             :                     {   // an insert was replaced
     155           0 :                         nRepS--;
     156           0 :                         nReplacePos = 0;
     157             :                     }
     158             :                 }
     159             :             }
     160             :         }
     161           0 :         nSPMin = Min3( npDistance[0], npDistance[1], npDistance[2] );
     162             :     }
     163             : 
     164             :     // calculate distance matrix
     165           0 :     sal_Int32 j = 0;        //for all columns of the pattern, till limit is not reached
     166           0 :     while ( (j < nPatternLen-1)
     167           0 :             && nSPMin <= (bSplitCount ? 2 * nLimit : nLimit) )
     168             :     {
     169             :         sal_Unicode c;
     170             :         int nP, nQ, nR, nPij, d1, d2;
     171             : 
     172           0 :         j++;
     173           0 :         c = cpPattern[j];
     174           0 :         if ( bpPatIsWild[j] )   // '*' or '?' not escaped
     175           0 :             nP = 0;     // could be replaced without penalty
     176             :         else
     177           0 :             nP = nRepP0;
     178           0 :         if ( c == '*' && bpPatIsWild[j] )
     179             :         {
     180           0 :             nQ = 0;     // instertion/deletion without penalty
     181           0 :             nR = 0;
     182             :         }
     183             :         else
     184             :         {
     185           0 :             nQ = nInsQ0;    //usual weighting
     186           0 :             nR = nDelR0;
     187             :         }
     188           0 :         d2 = npDistance[0];
     189             :         // increase insert count to get from null sting to pattern
     190           0 :         npDistance[0] = npDistance[0] + nQ;
     191           0 :         nSPMin = npDistance[0];
     192           0 :         int nReplacePos = -1;       // tristate Flag
     193             :         // for each pattern column run though the string
     194           0 :         for ( sal_Int32 i=1; i <= nStringLen; i++ )
     195             :         {
     196           0 :             d1 = d2;                // WLD( X(i-1), Y(j-1) )
     197           0 :             d2 = npDistance[i];     // WLD( X(i)  , Y(j-1) )
     198           0 :             if ( cString[i-1] == c )
     199             :             {
     200           0 :                 nPij = 0;           // p(i,j)
     201           0 :                 if ( nReplacePos < 0 )
     202             :                 {
     203             :                     // same quantity c
     204           0 :                     int nBalance = levdisbalance( j, i-1, c, cString, nStringLen );
     205           0 :                     if ( !nBalance )
     206           0 :                         nReplacePos = 0;    // no replacement
     207             :                 }
     208             :             }
     209             :             else
     210           0 :                 nPij = nP;
     211             :             // WLD( X(i), Y(j) ) = min( WLD( X(i-1), Y(j-1) ) + p(i,j) ,
     212             :             //                          WLD( X(i)  , Y(j-1) ) + q      ,
     213             :             //                          WLD( X(i-1), Y(j)   ) + r      )
     214           0 :             npDistance[i] = Min3( d1 + nPij, d2 + nQ, npDistance[i-1] + nR );
     215           0 :             if ( npDistance[i] < nSPMin )
     216           0 :                 nSPMin = npDistance[i];
     217           0 :             if ( bSplitCount )
     218             :             {
     219           0 :                 if ( nReplacePos < 0 && nPij && npDistance[i] == d1 + nPij )
     220             :                 {   // this poition will be replaced
     221           0 :                     nRepS++;
     222           0 :                     nReplacePos = i;
     223             :                 }
     224           0 :                 else if ( nReplacePos > 0 && !nPij )
     225             :                 {
     226             :                     // character is equal in string and pattern
     227             :                     //
     228             :                     //If from this point:
     229             :                     //* pattern and string have the same count of this character
     230             :                     //* and character count is the same before this position
     231             :                     //the replace was none.
     232             :                     //
     233             :                     //Scrambled letters are recognized and the replace is withdrawed.
     234             :                     //Whereby the double limit comes to fuition.
     235             :                     //
     236             :                     //Same quantity c
     237           0 :                     int nBalance = levdisbalance( j, i-1, c, cString, nStringLen );
     238           0 :                     if ( !nBalance )
     239             :                     {   // insert was replaced
     240           0 :                         nRepS--;
     241           0 :                         nReplacePos = 0;
     242             :                     }
     243             :                 }
     244             :             }
     245             :         }
     246             :     }
     247           0 :     if ( (nSPMin <= nLimit) && (npDistance[nStringLen] <= nLimit) )
     248           0 :         return(npDistance[nStringLen]);
     249             :     else
     250             :     {
     251           0 :         if ( bSplitCount )
     252             :         {
     253           0 :             if ( nRepS && nLenDiff > 0 )
     254           0 :                 nRepS -= nLenDiff;      // Inserts were counted
     255           0 :             if ( (nSPMin <= 2 * nLimit)
     256           0 :                     && (npDistance[nStringLen] <= 2 * nLimit)
     257           0 :                     && (nRepS * nRepP0 <= nLimit) )
     258           0 :                 return( -npDistance[nStringLen] );
     259           0 :             return(LEVDISBIG);
     260             :         }
     261           0 :         return(LEVDISBIG);
     262             :     }
     263             : }
     264             : 
     265             : // Calculating nLimit,   nReplP0,    nInsQ0,     nDelR0,     bSplitCount
     266             : // from user values           nOtherX,    nShorterY,  nLongerZ,   bRelaxed
     267           0 : int WLevDistance::CalcLPQR( int nX, int nY, int nZ, bool bRelaxed )
     268             : {
     269           0 :     if ( nX < 0 ) nX = 0;       // only positive values
     270           0 :     if ( nY < 0 ) nY = 0;
     271           0 :     if ( nZ < 0 ) nZ = 0;
     272           0 :     if (0 == Min3( nX, nY, nZ ))     // at least one 0
     273             :     {
     274             :         int nMid, nMax;
     275           0 :         nMax = Max3( nX, nY, nZ );      // either 0 for three 0s or Max
     276           0 :         if ( 0 == (nMid = Mid3( nX, nY, nZ )) )     // even two 0
     277           0 :             nLimit = nMax;  // either 0 or the only one >0
     278             :         else        // one is 0
     279           0 :             nLimit = LCM( nMid, nMax );
     280             :     }
     281             :     else        // all three of them are not 0
     282           0 :         nLimit = LCM( LCM( nX, nY ), nZ );
     283           0 :     nRepP0 = ( nX ? nLimit / nX : nLimit + 1 );
     284           0 :     nInsQ0 = ( nY ? nLimit / nY : nLimit + 1 );
     285           0 :     nDelR0 = ( nZ ? nLimit / nZ : nLimit + 1 );
     286           0 :     bSplitCount = bRelaxed;
     287           0 :     return( nLimit );
     288             : }
     289             : 
     290             : // greatest common divisior according to  Euklid (chaindivision)
     291             : // special case: 0 plus anything produces 1
     292           0 : int WLevDistance::GCD( int a, int b )
     293             : {
     294           0 :     if ( !a || !b )
     295           0 :         return 1;
     296           0 :     if ( a < 0 ) a = -a;
     297           0 :     if ( b < 0 ) b = -b;
     298           0 :     do
     299             :     {
     300           0 :         if ( a > b )
     301           0 :             a -= int(a / b) * b;
     302             :         else
     303           0 :             b -= int(b / a) * a;
     304           0 :     } while ( a && b );
     305           0 :     return( a ? a : b);
     306             : }
     307             : 
     308             : // least common multiple : a * b / GCD(a,b)
     309           0 : int WLevDistance::LCM( int a, int b )
     310             : {
     311           0 :     if ( a > b )    // decrease owerflow chance
     312           0 :         return( (a / GCD(a,b)) * b );
     313             :     else
     314           0 :         return( (b / GCD(a,b)) * a );
     315             : }
     316             : 
     317             : // Minimum of three values
     318           0 : inline int WLevDistance::Min3( int x, int y, int z )
     319             : {
     320           0 :     if ( x < y )
     321           0 :         return( x < z ? x : z );
     322             :     else
     323           0 :         return( y < z ? y : z );
     324             : }
     325             : 
     326             : // The value in the middle
     327           0 : int WLevDistance::Mid3( int x, int y, int z )
     328             : {
     329           0 :     int min = Min3(x,y,z);
     330           0 :     if ( x == min )
     331           0 :         return( y < z ? y : z);
     332           0 :     else if ( y == min )
     333           0 :         return( x < z ? x : z);
     334             :     else        // z == min
     335           0 :         return( x < y ? x : y);
     336             : }
     337             : 
     338             : // Maximum of three values
     339           0 : int WLevDistance::Max3( int x, int y, int z )
     340             : {
     341           0 :     if ( x > y )
     342           0 :         return( x > z ? x : z );
     343             :     else
     344           0 :         return( y > z ? y : z );
     345             : }
     346             : 
     347             : // initialize data from CTOR
     348           0 : void WLevDistance::InitData( const sal_Unicode* cPattern )
     349             : {
     350           0 :     cpPattern = aPatMem.GetcPtr();
     351           0 :     bpPatIsWild = aPatMem.GetbPtr();
     352           0 :     npDistance = aDisMem.GetPtr();
     353           0 :     nStars = 0;
     354           0 :     const sal_Unicode* cp1 = cPattern;
     355           0 :     sal_Unicode* cp2 = cpPattern;
     356           0 :     bool* bp = bpPatIsWild;
     357             :     // copy pattern, count asterisks, escaped Jokers
     358           0 :     while ( *cp1 )
     359             :     {
     360           0 :         if ( *cp1 == '\\' )     // maybe escaped
     361             :         {
     362           0 :             if ( *(cp1+1) == '*' || *(cp1+1) == '?' )   // next Joker?
     363             :             {
     364           0 :                 cp1++;          // skip '\\'
     365           0 :                 nPatternLen--;
     366             :             }
     367           0 :             *bp++ = false;
     368             :         }
     369           0 :         else if ( *cp1 == '*' || *cp1 == '?' )      // Joker
     370             :         {
     371           0 :             if ( *cp1 == '*' )
     372           0 :                 nStars++;
     373           0 :             *bp++ = true;
     374             :         }
     375             :         else
     376           0 :             *bp++ = false;
     377           0 :         *cp2++ = *cp1++;
     378             :     }
     379           0 :     *cp2 = '\0';
     380           0 : }
     381             : 
     382           0 : WLevDistance::WLevDistance( const sal_Unicode* cPattern,
     383             :                             int nOtherX, int nShorterY, int nLongerZ,
     384             :                             bool bRelaxed ) :
     385           0 :     nPatternLen( Impl_WLD_StringLen(cPattern) ),
     386             :     aPatMem( nPatternLen + 1 ),
     387           0 :     nArrayLen( nPatternLen + 1 ),
     388           0 :     aDisMem( nArrayLen )
     389             : {
     390           0 :     InitData( cPattern );
     391           0 :     CalcLPQR( nOtherX, nShorterY, nLongerZ, bRelaxed );
     392           0 : }
     393             : 
     394             : // CopyCTor
     395           0 : WLevDistance::WLevDistance( const WLevDistance& rWLD ) :
     396             :     nPatternLen( rWLD.nPatternLen ),
     397             :     aPatMem( nPatternLen + 1 ),
     398           0 :     nArrayLen( nPatternLen + 1 ),
     399             :     aDisMem( nArrayLen ),
     400             :     nLimit( rWLD.nLimit ),
     401             :     nRepP0( rWLD.nRepP0 ),
     402             :     nInsQ0( rWLD.nInsQ0 ),
     403             :     nDelR0( rWLD.nDelR0 ),
     404             :     nStars( rWLD.nStars ),
     405           0 :     bSplitCount( rWLD.bSplitCount )
     406             : {
     407           0 :     cpPattern = aPatMem.GetcPtr();
     408           0 :     bpPatIsWild = aPatMem.GetbPtr();
     409           0 :     npDistance = aDisMem.GetPtr();
     410             :     sal_Int32 i;
     411           0 :     for ( i=0; i<nPatternLen; i++ )
     412             :     {
     413           0 :         cpPattern[i] = rWLD.cpPattern[i];
     414           0 :         bpPatIsWild[i] = rWLD.bpPatIsWild[i];
     415             :     }
     416           0 :     cpPattern[i] = '\0';
     417           0 : }
     418             : 
     419             : // DTor
     420           0 : WLevDistance::~WLevDistance()
     421             : {
     422           0 : }
     423             : 
     424             : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */

Generated by: LCOV version 1.10