Line data Source code
1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /*
3 : * This file is part of the LibreOffice project.
4 : *
5 : * This Source Code Form is subject to the terms of the Mozilla Public
6 : * License, v. 2.0. If a copy of the MPL was not distributed with this
7 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 : *
9 : * This file incorporates work covered by the following license notice:
10 : *
11 : * Licensed to the Apache Software Foundation (ASF) under one or more
12 : * contributor license agreements. See the NOTICE file distributed
13 : * with this work for additional information regarding copyright
14 : * ownership. The ASF licenses this file to you under the Apache
15 : * License, Version 2.0 (the "License"); you may not use this file
16 : * except in compliance with the License. You may obtain a copy of
17 : * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 : */
19 :
20 : #ifndef INCLUDED_I18NPOOL_SOURCE_SEARCH_LEVDIS_HXX
21 : #define INCLUDED_I18NPOOL_SOURCE_SEARCH_LEVDIS_HXX
22 :
23 : #include <rtl/ustring.hxx>
24 :
25 : // Sensible default values for a user interface could be:
26 : // LEVDISDEFAULT_XOTHER 2
27 : // Maximum X replacements to match query, found data may be different by X
28 : // characters from query.
29 : // LEVDISDEFAULT_YSHORTER 1
30 : // Maximum Y insertions to match query, found data may be Y characters
31 : // shorter than query.
32 : // LEVDISDEFAULT_ZLONGER 3
33 : // Maximum Z deletions to match query, found data may be Z characters
34 : // longer than query.
35 : // LEVDISDEFAULT_RELAXED TRUE
36 : // Use relaxed SplitCount instead of mathematical WLD.
37 : //
38 : // Joker/wildcards ('?' and '*') of course do not count as
39 : // replacement/insertion/deletion. At a '?' a replacement is not counted, for a
40 : // '*' the found data may be any number of characters longer than the query.
41 : //
42 : // Strict mathematical WLD: EITHER maximum X replacements OR Y characters
43 : // shorter OR Z characters longer.
44 : // Relaxed SplitCount: maximum X replacements AND/OR Y character shorter AND/OR
45 : // Z characters longer. Any combination of actions is valid.
46 : //
47 : // The value range for X,Y,Z is 0..33 to keep the limit within a 16 bit signed
48 : // integer, 31*32*33 is the maximum limit, LCM(31,32,33) == 32736.
49 : //
50 : // The corresponding internal default weigh values for these user interface
51 : // values would be:
52 : // LEVDISDEFAULTLIMIT 6
53 : // Default nLimit, matches x=2, y=1, z=3, p=3, q=6, r=2
54 : // LEVDISDEFAULT_P0 3
55 : // Default nRepP0, weight of replacements.
56 : // LEVDISDEFAULT_Q0 6
57 : // Default nInsQ0, weight of insertions.
58 : // LEVDISDEFAULT_R0 2
59 : // Default nDelR0, weight of deletions.
60 :
61 : // The transformation of user input values to weighs is done using CalcLPQR().
62 : // One caveat, if the WLD reaches nLimit due to nDelR0 (i.e. data string is nZ
63 : // characters longer than pattern) then no character can be replaced any more.
64 : // This can be circumvented by increasing nX or/and nZ, but of course with the
65 : // side effect of being less strict then.. or the other solution is to use
66 : // relaxed SplitCount (see below), which is the default when using CalcLPQR().
67 : //
68 : // Attention: shorter = WLD.Insert, longer = WLD.Delete
69 : //
70 : // View and counting is always from data string to pattern, a deletion means
71 : // that something is deleted from data to match pattern.
72 : //
73 : // Deletions weigh less in this example because usually less is known than is
74 : // searched for. Replacements get middle weight, for example because of
75 : // misspellings. Insertions are expensive.
76 : //
77 : // Another example: P0 = 1, Q0 = 4, R0 = 4, Limit = 3
78 : // Allowed are maximum 4 replacements, no insertion, no deletion.
79 : // Matches the user interface values X = 3, Y = 0, Z = 0
80 : //
81 : // bSplitCount: if TRUE, Rep/Ins/Del are counted differently. The return value
82 : // of WLD() then isn't necessarily the Levenshtein-Distance, but can be
83 : // negative (-WLD) if the WLD is greater than nLimit but single values are
84 : // within the limits.
85 : // For the above default values that could mean: even if the found string is
86 : // already 2 characters longer (nLongerZ), 3 replacements (nOtherX) can be made
87 : // to reach pattern. Additionally, character swaps count as one replacement.
88 : // Mathematically completely incorrect, but meets user expectations ;-)
89 : //
90 : // Explanation: in the real WLD all actions are withdrawn from a common 100%
91 : // pool, if one gets all there's nothing left for others. With bSplitCount
92 : // replacements have their own pool.
93 :
94 :
95 : /** "Safe" memory allocation in ctor */
96 : class WLevDisPatternMem
97 : {
98 : sal_Unicode *cp;
99 : bool *bp;
100 : public:
101 0 : explicit WLevDisPatternMem( sal_Int32 s )
102 0 : : cp(new sal_Unicode[s])
103 0 : , bp(new bool[s])
104 : {
105 0 : }
106 :
107 0 : ~WLevDisPatternMem()
108 : {
109 0 : delete [] cp;
110 0 : delete [] bp;
111 0 : }
112 0 : sal_Unicode* GetcPtr() const { return cp; }
113 0 : bool* GetbPtr() const { return bp; }
114 : };
115 :
116 : class WLevDisDistanceMem
117 : {
118 : int* p;
119 : public:
120 0 : explicit WLevDisDistanceMem( size_t s )
121 0 : : p(0)
122 : {
123 0 : NewMem(s);
124 0 : }
125 0 : ~WLevDisDistanceMem() { delete [] p; }
126 0 : int* GetPtr() const { return p; }
127 0 : int* NewMem( size_t s )
128 : {
129 0 : delete [] p;
130 0 : return (p = new int[ s<3 ? 3 : s ]);
131 : }
132 : };
133 :
134 : /** Weighted Levenshtein Distance (WLD)
135 :
136 : For a more detailed explanation see documentation in
137 : i18npool/source/search/levdis.hxx
138 : */
139 : class WLevDistance
140 : {
141 : sal_Int32 nPatternLen; ///< length of pattern
142 : WLevDisPatternMem aPatMem; ///< manage allocation of pattern array
143 : sal_Unicode* cpPattern; ///< pointer to pattern array
144 : bool* bpPatIsWild; ///< pointer to bool array whether pattern is wildcard
145 : sal_Int32 nArrayLen; ///< length of distance array
146 : WLevDisDistanceMem aDisMem; ///< manage allocation of distance array
147 : int* npDistance; ///< pointer to distance array
148 : int nLimit; ///< WLD limit replacements/insertions/deletions
149 : int nRepP0; ///< replacement weigh
150 : int nInsQ0; ///< insertion weigh
151 : int nDelR0; ///< deletion weigh
152 : int nStars; ///< count of '*' wildcards in pattern
153 : bool bSplitCount; ///< if TRUE, Rep/Ins/Del are counted separately
154 :
155 : void InitData( const sal_Unicode* cPattern );
156 : static inline int Min3( int x, int y, int z ); ///< minimum value of 3 values
157 : static int Mid3( int x, int y, int z ); ///< middle value of 3 values
158 : static int Max3( int x, int y, int z ); ///< maximum value of 3 values
159 : static int GCD( int a, int b ); ///< Greatest Common Divisor
160 : static int LCM( int a, int b ); ///< Least Common Multiple
161 :
162 : public:
163 :
164 : /** CTor with user input. Internally calls CalcLPQR().
165 :
166 : After this, obtain the resulting limit using GetLimit().
167 :
168 : @param bRelaxed the mathematically incorrect method is default (TRUE)
169 : */
170 : WLevDistance( const sal_Unicode* cPattern, int nOtherX, int nShorterY,
171 : int nLongerZ, bool bRelaxed = true );
172 :
173 : WLevDistance( const WLevDistance& rWLD );
174 : ~WLevDistance();
175 :
176 : /** Calculate the Weighted Levenshtein Distance from string to pattern. */
177 : int WLD( const sal_Unicode* cString, sal_Int32 nStringLen );
178 :
179 : /** Calculate the internal weighs corresponding to the user input values.
180 : @returns nLimit for later comparison with WLD()
181 : */
182 : int CalcLPQR( int nOtherX, int nShorterY, int nLongerZ,
183 : bool bRelaxed = true );
184 :
185 0 : inline int GetLimit() const { return nLimit; }
186 : inline int GetReplaceP0() const { return nRepP0; }
187 : inline int GetInsertQ0() const { return nInsQ0; }
188 : inline int GetDeleteR0() const { return nDelR0; }
189 : inline bool GetSplit() const { return bSplitCount; }
190 : inline int SetLimit( int nNewLimit );
191 : inline int SetReplaceP0( int nNewP0 );
192 : inline int SetInsertQ0( int nNewQ0 );
193 : inline int SetDeleteR0( int nNewR0 );
194 : /** SetSplit(true) makes only sense after having called CalcLPQR() for the
195 : internal weighs! */
196 : inline bool SetSplit( bool bNewSplit );
197 :
198 : inline bool IsNormal( sal_Int32 nPos ) const { return !bpPatIsWild[nPos]; }
199 :
200 : // Calculate current balance, keep this inline for performance reasons!
201 : // c == cpPattern[jj] == cString[ii]
202 : // First seek up to found place, if the balance is still equal there then
203 : // also compare after the found place.
204 0 : int levdisbalance(sal_Int32 jj, sal_Int32 ii, sal_Unicode c, const sal_Unicode* cString, sal_Int32 nStringLen)
205 : {
206 0 : int nBalance = 0;
207 :
208 0 : if ( jj != ii )
209 : {
210 : sal_Int32 k;
211 0 : if ( jj > 0 )
212 0 : for ( k=0; k < jj; k++ )
213 0 : if ( cpPattern[k] == c )
214 0 : nBalance++;
215 0 : if ( ii > 0 )
216 0 : for ( k=0; k < ii; k++ )
217 0 : if ( cString[k] == c )
218 0 : nBalance--;
219 0 : if ( !nBalance )
220 : {
221 0 : for ( k=jj+1; k < nPatternLen; k++ )
222 0 : if ( cpPattern[k] == c )
223 0 : nBalance++;
224 0 : for ( k=ii+1; k < nStringLen; k++ )
225 0 : if ( cString[k] == c )
226 0 : nBalance--;
227 : }
228 : }
229 :
230 0 : return nBalance;
231 : }
232 : };
233 :
234 : inline int WLevDistance::SetLimit( int nNewLimit )
235 : {
236 : int nTmp = nLimit;
237 : nLimit = nNewLimit;
238 : return nTmp;
239 : }
240 :
241 : inline int WLevDistance::SetReplaceP0( int nNewP0 )
242 : {
243 : int nTmp = nRepP0;
244 : nRepP0 = nNewP0;
245 : return nTmp;
246 : }
247 :
248 : inline int WLevDistance::SetInsertQ0( int nNewQ0 )
249 : {
250 : int nTmp = nInsQ0;
251 : nInsQ0 = nNewQ0;
252 : return nTmp;
253 : }
254 :
255 : inline int WLevDistance::SetDeleteR0( int nNewR0 )
256 : {
257 : int nTmp = nDelR0;
258 : nDelR0 = nNewR0;
259 : return nTmp;
260 : }
261 :
262 : inline bool WLevDistance::SetSplit( bool bNewSplit )
263 : {
264 : bool bTmp = bSplitCount;
265 : bSplitCount = bNewSplit;
266 : return bTmp;
267 : }
268 :
269 : #endif
270 :
271 : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|