LCOV - code coverage report
Current view: top level - i18npool/source/search - levdis.cxx (source / functions) Hit Total Coverage
Test: commit c8344322a7af75b84dd3ca8f78b05543a976dfd5 Lines: 0 183 0.0 %
Date: 2015-06-13 12:38:46 Functions: 0 12 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
       2             : /*
       3             :  * This file is part of the LibreOffice project.
       4             :  *
       5             :  * This Source Code Form is subject to the terms of the Mozilla Public
       6             :  * License, v. 2.0. If a copy of the MPL was not distributed with this
       7             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       8             :  *
       9             :  * This file incorporates work covered by the following license notice:
      10             :  *
      11             :  *   Licensed to the Apache Software Foundation (ASF) under one or more
      12             :  *   contributor license agreements. See the NOTICE file distributed
      13             :  *   with this work for additional information regarding copyright
      14             :  *   ownership. The ASF licenses this file to you under the Apache
      15             :  *   License, Version 2.0 (the "License"); you may not use this file
      16             :  *   except in compliance with the License. You may obtain a copy of
      17             :  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
      18             :  */
      19             : 
      20             : /*
      21             : 
      22             :     Weighted Levenshtein Distance
      23             :     including wildcards
      24             :     '*' for any number (0 or more) of arbitrary characters
      25             :     '?' for exactly one arbitrary character
      26             :     escapeable with  backslash, "\*" or "\?"
      27             : 
      28             :     Return:
      29             :         WLD if WLD <= nLimit, else nLimit+1
      30             : 
      31             :     or, if bSplitCount:
      32             :         WLD if WLD <= nLimit
      33             :         -WLD if Replace and Insert and Delete <= nLimit
      34             :         else nLimit+1
      35             : 
      36             :     Recursive definition of WLD:
      37             : 
      38             :     WLD( X(i), Y(j) ) = min( WLD( X(i-1), Y(j-1) ) + p(i,j) ,
      39             :                              WLD( X(i)  , Y(j-1) ) + q      ,
      40             :                              WLD( X(i-1), Y(j)   ) + r      )
      41             : 
      42             :     X(i)   := the first i characters of the word X
      43             :     Y(j)   := the first j characters of the word Y
      44             :     p(i,j) := 0 if i-th character of X == j-th character of Y,
      45             :               p else
      46             : 
      47             :     Boundary conditions:
      48             :     WLD( X(0), Y(j) ) := j*q  (Y created by j inserts)
      49             :     WLD( X(i), Y(0) ) := i*r  (Y created by i deletes)
      50             :     WLD( X(0), Y(0) ) := 0
      51             : 
      52             :     Instead of recursions a dynamic algorithm is used.
      53             : 
      54             :     See also: German computer magazine
      55             :     c't 07/89 pages 192-208 and c't 03/94 pages 230-239
      56             : */
      57             : 
      58             : #include <string.h>
      59             : 
      60             : #if defined( _MSC_VER )
      61             : #pragma warning(once: 4068)
      62             : #endif
      63             : 
      64             : #include "levdis.hxx"
      65             : 
      66             : #ifdef SOLARIS
      67             : #undef min
      68             : #endif
      69             : 
      70             : #define LEVDISBIG   (nLimit + 1)    // Return value if distance > nLimit
      71             : #define LEVDISDOUBLEBUF 2048        // no doubling atop this border
      72             : 
      73           0 : static sal_Int32 Impl_WLD_StringLen( const sal_Unicode* pStr )
      74             : {
      75           0 :     const sal_Unicode* pTempStr = pStr;
      76           0 :     while( *pTempStr )
      77           0 :         pTempStr++;
      78           0 :     return (sal_Int32)(pTempStr-pStr);
      79             : }
      80             : 
      81             : // Distance from string to pattern
      82           0 : int WLevDistance::WLD( const sal_Unicode* cString, sal_Int32 nStringLen )
      83             : {
      84           0 :     int nSPMin = 0;     // penalty point Minimum
      85           0 :     int nRepS = 0;      // for SplitCount
      86             : 
      87             :     // length difference between pattern and string
      88           0 :     int nLenDiff = nPatternLen - nStars - nStringLen;
      89             :     // more insertions or deletions necessary as the limit? Then leave
      90           0 :     if ( (nLenDiff * nInsQ0 > nLimit)
      91           0 :             || ((nStars == 0) && (nLenDiff * nDelR0 < -nLimit)) )
      92           0 :         return LEVDISBIG;
      93             : 
      94             :      // comparative String greater than  instantaneous array
      95             :     // -> adapt array size
      96           0 :     if ( nStringLen >= nArrayLen )
      97             :     {
      98             :         // increase size much more to avoid reallocation
      99           0 :         if ( nStringLen < LEVDISDOUBLEBUF )
     100           0 :             nArrayLen = 2 * nStringLen;
     101             :         else
     102           0 :             nArrayLen = nStringLen + 1;
     103           0 :         npDistance = aDisMem.NewMem( nArrayLen );
     104             :     }
     105             : 
     106             :     // Calculate start values of the second column (first pattern value).
     107             :     // First column (0-Len pattern) is always zero .. nStringLen * nInsQ0,
     108             :     // therefore the minimum is 0
     109           0 :     if ( nPatternLen == 0 )
     110             :     {
     111             :         // Count of deletions to reach pattern
     112           0 :         for ( sal_Int32 i=0; i <= nStringLen; i++ )
     113           0 :             npDistance[i] = i * nDelR0;
     114             :     }
     115           0 :     else if ( cpPattern[0] == '*' && bpPatIsWild[0] )
     116             :     {
     117             :         // instead of a '*' you can fit in anything
     118           0 :         for ( sal_Int32 i=0; i <= nStringLen; i++ )
     119           0 :             npDistance[i] = 0;
     120             :     }
     121             :     else
     122             :     {
     123             :         sal_Unicode c;
     124             :         int nP;
     125           0 :         c = cpPattern[0];
     126           0 :         if ( c == '?' && bpPatIsWild[0] )
     127           0 :             nP = 0;     // a '?' could be any character.
     128             :         else
     129             :             // Minimum of replacement and deletion+insertion weighting
     130           0 :             nP = Min3( nRepP0, nRepP0, nDelR0 + nInsQ0 );
     131           0 :         npDistance[0] = nInsQ0;     // start with simple insert
     132           0 :         npDistance[1] = nInsQ0;
     133           0 :         npDistance[2] = nInsQ0;
     134           0 :         int nReplacePos = -1;       // tristate flag
     135           0 :         int nDelCnt = 0;
     136           0 :         for ( sal_Int32 i=1; i <= nStringLen; i++, nDelCnt += nDelR0 )
     137             :         {
     138           0 :             if ( cString[i-1] == c )
     139           0 :                 nP = 0;     // Replace from this position is 0
     140             :             // Deletions to match pattern + Replace
     141           0 :             npDistance[i] = nDelCnt + nP;
     142           0 :             if ( bSplitCount )
     143             :             {
     144           0 :                 if ( nReplacePos < 0 && nP )
     145             :                 {   // this position will be replaced
     146           0 :                     nRepS++;
     147           0 :                     nReplacePos = i;
     148             :                 }
     149           0 :                 else if ( nReplacePos > 0 && !nP )
     150             :                 {
     151             :                     // same count of c
     152           0 :                     int nBalance = levdisbalance( 0, i-1, c, cString, nStringLen );
     153           0 :                     if ( !nBalance )
     154             :                     {   // one was replaced that was an insertion instead
     155           0 :                         nRepS--;
     156           0 :                         nReplacePos = 0;
     157             :                     }
     158             :                 }
     159             :             }
     160             :         }
     161           0 :         nSPMin = Min3( npDistance[0], npDistance[1], npDistance[2] );
     162             :     }
     163             : 
     164             :     // calculate distance matrix
     165           0 :     sal_Int32 j = 0;        // for all columns of the pattern, till limit is not reached
     166           0 :     while ( (j < nPatternLen-1)
     167           0 :             && nSPMin <= (bSplitCount ? 2 * nLimit : nLimit) )
     168             :     {
     169             :         sal_Unicode c;
     170             :         int nP, nQ, nR, nPij, d2;
     171             : 
     172           0 :         j++;
     173           0 :         c = cpPattern[j];
     174           0 :         if ( bpPatIsWild[j] )   // '*' or '?' not escaped
     175           0 :             nP = 0;     // could be replaced without penalty
     176             :         else
     177           0 :             nP = nRepP0;
     178           0 :         if ( c == '*' && bpPatIsWild[j] )
     179             :         {
     180           0 :             nQ = 0;     // instertion and deletion without penalty
     181           0 :             nR = 0;
     182             :         }
     183             :         else
     184             :         {
     185           0 :             nQ = nInsQ0;    // usual weighting
     186           0 :             nR = nDelR0;
     187             :         }
     188           0 :         d2 = npDistance[0];
     189             :         // increase insert count to get from null string to pattern
     190           0 :         npDistance[0] = npDistance[0] + nQ;
     191           0 :         nSPMin = npDistance[0];
     192           0 :         int nReplacePos = -1;       // tristate flag
     193             :         // for each pattern column run through the string
     194           0 :         for ( sal_Int32 i=1; i <= nStringLen; i++ )
     195             :         {
     196           0 :             int d1 = d2;            // WLD( X(i-1), Y(j-1) )
     197           0 :             d2 = npDistance[i];     // WLD( X(i)  , Y(j-1) )
     198           0 :             if ( cString[i-1] == c )
     199             :             {
     200           0 :                 nPij = 0;           // p(i,j)
     201           0 :                 if ( nReplacePos < 0 )
     202             :                 {
     203             :                     // same count of c
     204           0 :                     int nBalance = levdisbalance( j, i-1, c, cString, nStringLen );
     205           0 :                     if ( !nBalance )
     206           0 :                         nReplacePos = 0;    // no replacement
     207             :                 }
     208             :             }
     209             :             else
     210           0 :                 nPij = nP;
     211             :             // WLD( X(i), Y(j) ) = min( WLD( X(i-1), Y(j-1) ) + p(i,j) ,
     212             :             //                          WLD( X(i)  , Y(j-1) ) + q      ,
     213             :             //                          WLD( X(i-1), Y(j)   ) + r      )
     214           0 :             npDistance[i] = Min3( d1 + nPij, d2 + nQ, npDistance[i-1] + nR );
     215           0 :             if ( npDistance[i] < nSPMin )
     216           0 :                 nSPMin = npDistance[i];
     217           0 :             if ( bSplitCount )
     218             :             {
     219           0 :                 if ( nReplacePos < 0 && nPij && npDistance[i] == d1 + nPij )
     220             :                 {   // this position will be replaced
     221           0 :                     nRepS++;
     222           0 :                     nReplacePos = i;
     223             :                 }
     224           0 :                 else if ( nReplacePos > 0 && !nPij )
     225             :                 {
     226             :                     // character is equal in string and pattern
     227             :                     //
     228             :                     // If from this point:
     229             :                     // * pattern and string have the same count of this
     230             :                     //   character
     231             :                     // * and character count is the same before this position
     232             :                     // then the replace was none.
     233             :                     //
     234             :                     // Scrambled letters are recognized here and the nRepS
     235             :                     // replacement is withdrawn, whereby the double limit kicks
     236             :                     // in.
     237             : 
     238             :                     // Same count of c
     239           0 :                     int nBalance = levdisbalance( j, i-1, c, cString, nStringLen );
     240           0 :                     if ( !nBalance )
     241             :                     {   // one was replaced that was an insertion instead
     242           0 :                         nRepS--;
     243           0 :                         nReplacePos = 0;
     244             :                     }
     245             :                 }
     246             :             }
     247             :         }
     248             :     }
     249           0 :     if ( (nSPMin <= nLimit) && (npDistance[nStringLen] <= nLimit) )
     250           0 :         return npDistance[nStringLen];
     251             :     else
     252             :     {
     253           0 :         if ( bSplitCount )
     254             :         {
     255           0 :             if ( nRepS && nLenDiff > 0 )
     256           0 :                 nRepS -= nLenDiff;      // Inserts were counted
     257           0 :             if ( (nSPMin <= 2 * nLimit)
     258           0 :                     && (npDistance[nStringLen] <= 2 * nLimit)
     259           0 :                     && (nRepS * nRepP0 <= nLimit) )
     260           0 :                 return -npDistance[nStringLen];
     261           0 :             return LEVDISBIG;
     262             :         }
     263           0 :         return LEVDISBIG;
     264             :     }
     265             : }
     266             : 
     267             : // Calculating      nLimit,   nReplP0,    nInsQ0,     nDelR0,     bSplitCount
     268             : // from user values           nOtherX,    nShorterY,  nLongerZ,   bRelaxed
     269           0 : int WLevDistance::CalcLPQR( int nX, int nY, int nZ, bool bRelaxed )
     270             : {
     271           0 :     if ( nX < 0 ) nX = 0;       // only positive values
     272           0 :     if ( nY < 0 ) nY = 0;
     273           0 :     if ( nZ < 0 ) nZ = 0;
     274           0 :     if (0 == Min3( nX, nY, nZ ))                // at least one 0
     275             :     {
     276             :         int nMid, nMax;
     277           0 :         nMax = Max3( nX, nY, nZ );              // either 0 for three 0s or Max
     278           0 :         if ( 0 == (nMid = Mid3( nX, nY, nZ )) ) // even two 0
     279           0 :             nLimit = nMax;                      // either 0 or the only one >0
     280             :         else                                    // one is 0
     281           0 :             nLimit = LCM( nMid, nMax );
     282             :     }
     283             :     else                                        // all three of them are not 0
     284           0 :         nLimit = LCM( LCM( nX, nY ), nZ );
     285           0 :     nRepP0 = ( nX ? nLimit / nX : nLimit + 1 );
     286           0 :     nInsQ0 = ( nY ? nLimit / nY : nLimit + 1 );
     287           0 :     nDelR0 = ( nZ ? nLimit / nZ : nLimit + 1 );
     288           0 :     bSplitCount = bRelaxed;
     289           0 :     return nLimit;
     290             : }
     291             : 
     292             : // greatest common divisior according to  Euklid (chaindivision)
     293             : // special case: 0 plus anything produces 1
     294           0 : int WLevDistance::GCD( int a, int b )
     295             : {
     296           0 :     if ( !a || !b )
     297           0 :         return 1;
     298           0 :     if ( a < 0 ) a = -a;
     299           0 :     if ( b < 0 ) b = -b;
     300           0 :     do
     301             :     {
     302           0 :         if ( a > b )
     303           0 :             a -= int(a / b) * b;
     304             :         else
     305           0 :             b -= int(b / a) * a;
     306           0 :     } while ( a && b );
     307           0 :     return( a ? a : b);
     308             : }
     309             : 
     310             : // least common multiple : a * b / GCD(a,b)
     311           0 : int WLevDistance::LCM( int a, int b )
     312             : {
     313           0 :     if ( a > b )    // decrease owerflow chance
     314           0 :         return( (a / GCD(a,b)) * b );
     315             :     else
     316           0 :         return( (b / GCD(a,b)) * a );
     317             : }
     318             : 
     319             : // Minimum of three values
     320           0 : inline int WLevDistance::Min3( int x, int y, int z )
     321             : {
     322           0 :     if ( x < y )
     323           0 :         return( x < z ? x : z );
     324             :     else
     325           0 :         return( y < z ? y : z );
     326             : }
     327             : 
     328             : // The value in the middle
     329           0 : int WLevDistance::Mid3( int x, int y, int z )
     330             : {
     331           0 :     int min = Min3(x,y,z);
     332           0 :     if ( x == min )
     333           0 :         return( y < z ? y : z);
     334           0 :     else if ( y == min )
     335           0 :         return( x < z ? x : z);
     336             :     else        // z == min
     337           0 :         return( x < y ? x : y);
     338             : }
     339             : 
     340             : // Maximum of three values
     341           0 : int WLevDistance::Max3( int x, int y, int z )
     342             : {
     343           0 :     if ( x > y )
     344           0 :         return( x > z ? x : z );
     345             :     else
     346           0 :         return( y > z ? y : z );
     347             : }
     348             : 
     349             : // initialize data from CTOR
     350           0 : void WLevDistance::InitData( const sal_Unicode* cPattern )
     351             : {
     352           0 :     cpPattern = aPatMem.GetcPtr();
     353           0 :     bpPatIsWild = aPatMem.GetbPtr();
     354           0 :     npDistance = aDisMem.GetPtr();
     355           0 :     nStars = 0;
     356           0 :     const sal_Unicode* cp1 = cPattern;
     357           0 :     sal_Unicode* cp2 = cpPattern;
     358           0 :     bool* bp = bpPatIsWild;
     359             :     // copy pattern, count asterisks, escaped Jokers
     360           0 :     while ( *cp1 )
     361             :     {
     362           0 :         if ( *cp1 == '\\' )     // maybe escaped
     363             :         {
     364           0 :             if ( *(cp1+1) == '*' || *(cp1+1) == '?' )   // next Joker?
     365             :             {
     366           0 :                 cp1++;          // skip '\\'
     367           0 :                 nPatternLen--;
     368             :             }
     369           0 :             *bp++ = false;
     370             :         }
     371           0 :         else if ( *cp1 == '*' || *cp1 == '?' )      // Joker
     372             :         {
     373           0 :             if ( *cp1 == '*' )
     374           0 :                 nStars++;
     375           0 :             *bp++ = true;
     376             :         }
     377             :         else
     378           0 :             *bp++ = false;
     379           0 :         *cp2++ = *cp1++;
     380             :     }
     381           0 :     *cp2 = '\0';
     382           0 : }
     383             : 
     384           0 : WLevDistance::WLevDistance( const sal_Unicode* cPattern,
     385             :                             int nOtherX, int nShorterY, int nLongerZ,
     386             :                             bool bRelaxed ) :
     387           0 :     nPatternLen( Impl_WLD_StringLen(cPattern) ),
     388             :     aPatMem( nPatternLen + 1 ),
     389           0 :     nArrayLen( nPatternLen + 1 ),
     390           0 :     aDisMem( nArrayLen )
     391             : {
     392           0 :     InitData( cPattern );
     393           0 :     CalcLPQR( nOtherX, nShorterY, nLongerZ, bRelaxed );
     394           0 : }
     395             : 
     396             : // CopyCTor
     397           0 : WLevDistance::WLevDistance( const WLevDistance& rWLD ) :
     398             :     nPatternLen( rWLD.nPatternLen ),
     399             :     aPatMem( nPatternLen + 1 ),
     400           0 :     nArrayLen( nPatternLen + 1 ),
     401             :     aDisMem( nArrayLen ),
     402             :     nLimit( rWLD.nLimit ),
     403             :     nRepP0( rWLD.nRepP0 ),
     404             :     nInsQ0( rWLD.nInsQ0 ),
     405             :     nDelR0( rWLD.nDelR0 ),
     406             :     nStars( rWLD.nStars ),
     407           0 :     bSplitCount( rWLD.bSplitCount )
     408             : {
     409           0 :     cpPattern = aPatMem.GetcPtr();
     410           0 :     bpPatIsWild = aPatMem.GetbPtr();
     411           0 :     npDistance = aDisMem.GetPtr();
     412             :     sal_Int32 i;
     413           0 :     for ( i=0; i<nPatternLen; i++ )
     414             :     {
     415           0 :         cpPattern[i] = rWLD.cpPattern[i];
     416           0 :         bpPatIsWild[i] = rWLD.bpPatIsWild[i];
     417             :     }
     418           0 :     cpPattern[i] = '\0';
     419           0 : }
     420             : 
     421             : // DTor
     422           0 : WLevDistance::~WLevDistance()
     423             : {
     424           0 : }
     425             : 
     426             : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */

Generated by: LCOV version 1.11