LCOV - code coverage report
Current view: top level - i18npool/source/search - levdis.hxx (source / functions) Hit Total Coverage
Test: commit c8344322a7af75b84dd3ca8f78b05543a976dfd5 Lines: 0 39 0.0 %
Date: 2015-06-13 12:38:46 Functions: 0 10 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
       2             : /*
       3             :  * This file is part of the LibreOffice project.
       4             :  *
       5             :  * This Source Code Form is subject to the terms of the Mozilla Public
       6             :  * License, v. 2.0. If a copy of the MPL was not distributed with this
       7             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       8             :  *
       9             :  * This file incorporates work covered by the following license notice:
      10             :  *
      11             :  *   Licensed to the Apache Software Foundation (ASF) under one or more
      12             :  *   contributor license agreements. See the NOTICE file distributed
      13             :  *   with this work for additional information regarding copyright
      14             :  *   ownership. The ASF licenses this file to you under the Apache
      15             :  *   License, Version 2.0 (the "License"); you may not use this file
      16             :  *   except in compliance with the License. You may obtain a copy of
      17             :  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
      18             :  */
      19             : 
      20             : #ifndef INCLUDED_I18NPOOL_SOURCE_SEARCH_LEVDIS_HXX
      21             : #define INCLUDED_I18NPOOL_SOURCE_SEARCH_LEVDIS_HXX
      22             : 
      23             : #include <rtl/ustring.hxx>
      24             : 
      25             : // Sensible default values for a user interface could be:
      26             : //  LEVDISDEFAULT_XOTHER    2
      27             : //      Maximum X replacements to match query, found data may be different by X
      28             : //      characters from query.
      29             : //  LEVDISDEFAULT_YSHORTER  1
      30             : //      Maximum Y insertions to match query, found data may be Y characters
      31             : //      shorter than query.
      32             : //  LEVDISDEFAULT_ZLONGER   3
      33             : //      Maximum Z deletions to match query, found data may be Z characters
      34             : //      longer than query.
      35             : //  LEVDISDEFAULT_RELAXED   TRUE
      36             : //      Use relaxed SplitCount instead of mathematical WLD.
      37             : //
      38             : // Joker/wildcards ('?' and '*') of course do not count as
      39             : // replacement/insertion/deletion. At a '?' a replacement is not counted, for a
      40             : // '*' the found data may be any number of characters longer than the query.
      41             : //
      42             : // Strict mathematical WLD: EITHER maximum X replacements OR Y characters
      43             : // shorter OR Z characters longer.
      44             : // Relaxed SplitCount: maximum X replacements AND/OR Y character shorter AND/OR
      45             : // Z characters longer. Any combination of actions is valid.
      46             : //
      47             : // The value range for X,Y,Z is 0..33 to keep the limit within a 16 bit signed
      48             : // integer, 31*32*33 is the maximum limit, LCM(31,32,33) == 32736.
      49             : //
      50             : // The corresponding internal default weigh values for these user interface
      51             : // values would be:
      52             : //  LEVDISDEFAULTLIMIT  6
      53             : //      Default nLimit, matches x=2, y=1, z=3, p=3, q=6, r=2
      54             : //  LEVDISDEFAULT_P0    3
      55             : //      Default nRepP0, weight of replacements.
      56             : //  LEVDISDEFAULT_Q0    6
      57             : //      Default nInsQ0, weight of insertions.
      58             : //  LEVDISDEFAULT_R0    2
      59             : //      Default nDelR0, weight of deletions.
      60             : 
      61             : // The transformation of user input values to weighs is done using CalcLPQR().
      62             : // One caveat, if the WLD reaches nLimit due to nDelR0 (i.e. data string is nZ
      63             : // characters longer than pattern) then no character can be replaced any more.
      64             : // This can be circumvented by increasing nX or/and nZ, but of course with the
      65             : // side effect of being less strict then.. or the other solution is to use
      66             : // relaxed SplitCount (see below), which is the default when using CalcLPQR().
      67             : //
      68             : // Attention: shorter = WLD.Insert, longer = WLD.Delete
      69             : //
      70             : // View and counting is always from data string to pattern, a deletion means
      71             : // that something is deleted from data to match pattern.
      72             : //
      73             : // Deletions weigh less in this example because usually less is known than is
      74             : // searched for. Replacements get middle weight, for example because of
      75             : // misspellings. Insertions are expensive.
      76             : //
      77             : // Another example: P0 = 1, Q0 = 4, R0 = 4, Limit = 3
      78             : // Allowed are maximum 4 replacements, no insertion, no deletion.
      79             : // Matches the user interface values X = 3, Y = 0, Z = 0
      80             : //
      81             : // bSplitCount: if TRUE, Rep/Ins/Del are counted differently. The return value
      82             : // of WLD() then isn't necessarily the Levenshtein-Distance, but can be
      83             : // negative (-WLD) if the WLD is greater than nLimit but single values are
      84             : // within the limits.
      85             : // For the above default values that could mean: even if the found string is
      86             : // already 2 characters longer (nLongerZ), 3 replacements (nOtherX) can be made
      87             : // to reach pattern. Additionally, character swaps count as one replacement.
      88             : // Mathematically completely incorrect, but meets user expectations ;-)
      89             : //
      90             : // Explanation: in the real WLD all actions are withdrawn from a common 100%
      91             : // pool, if one gets all there's nothing left for others. With bSplitCount
      92             : // replacements have their own pool.
      93             : 
      94             : 
      95             : /** "Safe" memory allocation in ctor */
      96             : class WLevDisPatternMem
      97             : {
      98             :     sal_Unicode     *cp;
      99             :     bool            *bp;
     100             : public:
     101           0 :     explicit WLevDisPatternMem( sal_Int32 s )
     102           0 :         : cp(new sal_Unicode[s])
     103           0 :         , bp(new bool[s])
     104             :     {
     105           0 :     }
     106             : 
     107           0 :     ~WLevDisPatternMem()
     108             :     {
     109           0 :         delete [] cp;
     110           0 :         delete [] bp;
     111           0 :     }
     112           0 :     sal_Unicode* GetcPtr() const        { return cp; }
     113           0 :     bool* GetbPtr() const               { return bp; }
     114             : };
     115             : 
     116             : class WLevDisDistanceMem
     117             : {
     118             :     int*    p;
     119             : public:
     120           0 :     explicit WLevDisDistanceMem( size_t s )
     121           0 :         : p(0)
     122             :     {
     123           0 :         NewMem(s);
     124           0 :     }
     125           0 :     ~WLevDisDistanceMem()           { delete [] p; }
     126           0 :     int* GetPtr() const             { return p; }
     127           0 :     int* NewMem( size_t s )
     128             :     {
     129           0 :         delete [] p;
     130           0 :         return (p = new int[ s<3 ? 3 : s ]);
     131             :     }
     132             : };
     133             : 
     134             : /** Weighted Levenshtein Distance (WLD)
     135             : 
     136             :     For a more detailed explanation see documentation in
     137             :     i18npool/source/search/levdis.hxx
     138             :  */
     139             : class WLevDistance
     140             : {
     141             :     sal_Int32       nPatternLen;    ///< length of pattern
     142             :     WLevDisPatternMem   aPatMem;    ///< manage allocation of pattern array
     143             :     sal_Unicode*    cpPattern;      ///< pointer to pattern array
     144             :     bool*           bpPatIsWild;    ///< pointer to bool array whether pattern is wildcard
     145             :     sal_Int32       nArrayLen;      ///< length of distance array
     146             :     WLevDisDistanceMem  aDisMem;    ///< manage allocation of distance array
     147             :     int*            npDistance;     ///< pointer to distance array
     148             :     int             nLimit;         ///< WLD limit replacements/insertions/deletions
     149             :     int             nRepP0;         ///< replacement weigh
     150             :     int             nInsQ0;         ///< insertion weigh
     151             :     int             nDelR0;         ///< deletion weigh
     152             :     int             nStars;         ///< count of '*' wildcards in pattern
     153             :     bool            bSplitCount;    ///< if TRUE, Rep/Ins/Del are counted separately
     154             : 
     155             :     void InitData( const sal_Unicode* cPattern );
     156             :     static inline int Min3( int x, int y, int z ); ///< minimum value of 3 values
     157             :     static int Mid3( int x, int y, int z );        ///< middle value of 3 values
     158             :     static int Max3( int x, int y, int z );        ///< maximum value of 3 values
     159             :     static int GCD( int a, int b );                ///< Greatest Common Divisor
     160             :     static int LCM( int a, int b );                ///< Least Common Multiple
     161             : 
     162             : public:
     163             : 
     164             :     /** CTor with user input. Internally calls CalcLPQR().
     165             : 
     166             :         After this, obtain the resulting limit using GetLimit().
     167             : 
     168             :         @param  bRelaxed    the mathematically incorrect method is default (TRUE)
     169             :      */
     170             :     WLevDistance( const sal_Unicode* cPattern, int nOtherX, int nShorterY,
     171             :                     int nLongerZ, bool bRelaxed = true );
     172             : 
     173             :     WLevDistance( const WLevDistance& rWLD );
     174             :     ~WLevDistance();
     175             : 
     176             :     /** Calculate the Weighted Levenshtein Distance from string to pattern. */
     177             :     int WLD( const sal_Unicode* cString, sal_Int32 nStringLen );
     178             : 
     179             :     /** Calculate the internal weighs corresponding to the user input values.
     180             :         @returns nLimit for later comparison with WLD()
     181             :      */
     182             :     int CalcLPQR( int nOtherX, int nShorterY, int nLongerZ,
     183             :                     bool bRelaxed = true );
     184             : 
     185           0 :     inline int GetLimit() const     { return nLimit; }
     186             :     inline int GetReplaceP0() const { return nRepP0; }
     187             :     inline int GetInsertQ0() const  { return nInsQ0; }
     188             :     inline int GetDeleteR0() const  { return nDelR0; }
     189             :     inline bool GetSplit() const    { return bSplitCount; }
     190             :     inline int SetLimit( int nNewLimit );
     191             :     inline int SetReplaceP0( int nNewP0 );
     192             :     inline int SetInsertQ0( int nNewQ0 );
     193             :     inline int SetDeleteR0( int nNewR0 );
     194             :     /** SetSplit(true) makes only sense after having called CalcLPQR() for the
     195             :         internal weighs! */
     196             :     inline bool SetSplit( bool bNewSplit );
     197             : 
     198             :     inline bool IsNormal( sal_Int32 nPos ) const { return !bpPatIsWild[nPos]; }
     199             : 
     200             :     // Calculate current balance, keep this inline for performance reasons!
     201             :     // c == cpPattern[jj] == cString[ii]
     202             :     // First seek up to found place, if the balance is still equal there then
     203             :     // also compare after the found place.
     204           0 :     int levdisbalance(sal_Int32 jj, sal_Int32 ii, sal_Unicode c, const sal_Unicode* cString, sal_Int32 nStringLen)
     205             :     {
     206           0 :         int nBalance = 0;
     207             : 
     208           0 :         if ( jj != ii )
     209             :         {
     210             :             sal_Int32 k;
     211           0 :             if ( jj > 0 )
     212           0 :                 for ( k=0; k < jj; k++ )
     213           0 :                     if ( cpPattern[k] == c )
     214           0 :                         nBalance++;
     215           0 :             if ( ii > 0 )
     216           0 :                 for ( k=0; k < ii; k++ )
     217           0 :                     if ( cString[k] == c )
     218           0 :                         nBalance--;
     219           0 :             if ( !nBalance )
     220             :             {
     221           0 :                 for ( k=jj+1; k < nPatternLen; k++ )
     222           0 :                     if ( cpPattern[k] == c )
     223           0 :                         nBalance++;
     224           0 :                 for ( k=ii+1; k < nStringLen; k++ )
     225           0 :                     if ( cString[k] == c )
     226           0 :                         nBalance--;
     227             :             }
     228             :         }
     229             : 
     230           0 :         return nBalance;
     231             :     }
     232             : };
     233             : 
     234             : inline int WLevDistance::SetLimit( int nNewLimit )
     235             : {
     236             :     int nTmp = nLimit;
     237             :     nLimit = nNewLimit;
     238             :     return nTmp;
     239             : }
     240             : 
     241             : inline int WLevDistance::SetReplaceP0( int nNewP0 )
     242             : {
     243             :     int nTmp = nRepP0;
     244             :     nRepP0 = nNewP0;
     245             :     return nTmp;
     246             : }
     247             : 
     248             : inline int WLevDistance::SetInsertQ0( int nNewQ0 )
     249             : {
     250             :     int nTmp = nInsQ0;
     251             :     nInsQ0 = nNewQ0;
     252             :     return nTmp;
     253             : }
     254             : 
     255             : inline int WLevDistance::SetDeleteR0( int nNewR0 )
     256             : {
     257             :     int nTmp = nDelR0;
     258             :     nDelR0 = nNewR0;
     259             :     return nTmp;
     260             : }
     261             : 
     262             : inline bool WLevDistance::SetSplit( bool bNewSplit )
     263             : {
     264             :     bool bTmp = bSplitCount;
     265             :     bSplitCount = bNewSplit;
     266             :     return bTmp;
     267             : }
     268             : 
     269             : #endif
     270             : 
     271             : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */

Generated by: LCOV version 1.11