Line data Source code
1 : /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /*
3 : * This file is part of the LibreOffice project.
4 : *
5 : * This Source Code Form is subject to the terms of the Mozilla Public
6 : * License, v. 2.0. If a copy of the MPL was not distributed with this
7 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 : *
9 : * This file incorporates work covered by the following license notice:
10 : *
11 : * Licensed to the Apache Software Foundation (ASF) under one or more
12 : * contributor license agreements. See the NOTICE file distributed
13 : * with this work for additional information regarding copyright
14 : * ownership. The ASF licenses this file to you under the Apache
15 : * License, Version 2.0 (the "License"); you may not use this file
16 : * except in compliance with the License. You may obtain a copy of
17 : * the License at http://www.apache.org/licenses/LICENSE-2.0 .
18 : */
19 :
20 : #include "textsearch.hxx"
21 : #include "levdis.hxx"
22 : #include <com/sun/star/lang/Locale.hpp>
23 : #include <com/sun/star/lang/XMultiServiceFactory.hpp>
24 : #include <comphelper/processfactory.hxx>
25 : #include <com/sun/star/i18n/BreakIterator.hpp>
26 : #include <com/sun/star/i18n/UnicodeType.hpp>
27 : #include <com/sun/star/util/SearchFlags.hpp>
28 : #include <com/sun/star/i18n/WordType.hpp>
29 : #include <com/sun/star/i18n/ScriptType.hpp>
30 : #include <com/sun/star/i18n/CharacterIteratorMode.hpp>
31 : #include <com/sun/star/i18n/CharacterClassification.hpp>
32 : #include <com/sun/star/i18n/KCharacterType.hpp>
33 : #include <com/sun/star/i18n/Transliteration.hpp>
34 : #include <com/sun/star/registry/XRegistryKey.hpp>
35 : #include <cppuhelper/factory.hxx>
36 : #include <cppuhelper/weak.hxx>
37 :
38 : #ifdef _MSC_VER
39 : // get rid of that dumb compiler warning
40 : // identifier was truncated to '255' characters in the debug information
41 : // for STL template usage, if .pdb files are to be created
42 : #pragma warning( disable: 4786 )
43 : #endif
44 :
45 : #include <string.h>
46 :
47 : using namespace ::com::sun::star::util;
48 : using namespace ::com::sun::star::uno;
49 : using namespace ::com::sun::star::lang;
50 : using namespace ::com::sun::star::i18n;
51 : using namespace ::com::sun::star;
52 :
53 : static sal_Int32 COMPLEX_TRANS_MASK_TMP =
54 : TransliterationModules_ignoreBaFa_ja_JP |
55 : TransliterationModules_ignoreIterationMark_ja_JP |
56 : TransliterationModules_ignoreTiJi_ja_JP |
57 : TransliterationModules_ignoreHyuByu_ja_JP |
58 : TransliterationModules_ignoreSeZe_ja_JP |
59 : TransliterationModules_ignoreIandEfollowedByYa_ja_JP |
60 : TransliterationModules_ignoreKiKuFollowedBySa_ja_JP |
61 : TransliterationModules_ignoreProlongedSoundMark_ja_JP;
62 2 : static const sal_Int32 COMPLEX_TRANS_MASK = COMPLEX_TRANS_MASK_TMP | TransliterationModules_IGNORE_KANA | TransliterationModules_FULLWIDTH_HALFWIDTH;
63 2 : static const sal_Int32 SIMPLE_TRANS_MASK = ~COMPLEX_TRANS_MASK;
64 : // Above 2 transliteration is simple but need to take effect in
65 : // complex transliteration
66 :
67 3 : TextSearch::TextSearch(const Reference < XComponentContext > & rxContext)
68 : : m_xContext( rxContext )
69 : , pJumpTable( 0 )
70 : , pJumpTable2( 0 )
71 : , pRegexMatcher( NULL )
72 3 : , pWLD( 0 )
73 : {
74 3 : SearchOptions aOpt;
75 3 : aOpt.algorithmType = SearchAlgorithms_ABSOLUTE;
76 3 : aOpt.searchFlag = SearchFlags::ALL_IGNORE_CASE;
77 : //aOpt.Locale = ???;
78 3 : setOptions( aOpt );
79 3 : }
80 :
81 6 : TextSearch::~TextSearch()
82 : {
83 2 : delete pRegexMatcher;
84 2 : delete pWLD;
85 2 : delete pJumpTable;
86 2 : delete pJumpTable2;
87 4 : }
88 :
89 5 : void TextSearch::setOptions( const SearchOptions& rOptions ) throw( RuntimeException )
90 : {
91 5 : aSrchPara = rOptions;
92 :
93 5 : delete pRegexMatcher, pRegexMatcher = NULL;
94 5 : delete pWLD, pWLD = 0;
95 5 : delete pJumpTable, pJumpTable = 0;
96 5 : delete pJumpTable2, pJumpTable2 = 0;
97 :
98 : // Create Transliteration class
99 5 : if( aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK )
100 : {
101 1 : if( !xTranslit.is() )
102 1 : xTranslit.set( Transliteration::create( m_xContext ) );
103 1 : xTranslit->loadModule(
104 : (TransliterationModules)( aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK ),
105 1 : aSrchPara.Locale);
106 : }
107 4 : else if( xTranslit.is() )
108 0 : xTranslit = 0;
109 :
110 : // Create Transliteration for 2<->1, 2<->2 transliteration
111 5 : if ( aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK )
112 : {
113 0 : if( !xTranslit2.is() )
114 0 : xTranslit2.set( Transliteration::create( m_xContext ) );
115 : // Load transliteration module
116 0 : xTranslit2->loadModule(
117 : (TransliterationModules)( aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK ),
118 0 : aSrchPara.Locale);
119 : }
120 :
121 5 : if ( !xBreak.is() )
122 3 : xBreak = com::sun::star::i18n::BreakIterator::create( m_xContext );
123 :
124 5 : sSrchStr = aSrchPara.searchString;
125 :
126 : // use transliteration here
127 5 : if ( xTranslit.is() &&
128 : aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK )
129 1 : sSrchStr = xTranslit->transliterateString2String(
130 1 : aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
131 :
132 5 : if ( xTranslit2.is() &&
133 : aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK )
134 0 : sSrchStr2 = xTranslit2->transliterateString2String(
135 0 : aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
136 :
137 : // When start or end of search string is a complex script type, we need to
138 : // make sure the result boundary is not located in the middle of cell.
139 10 : checkCTLStart = (xBreak.is() && (xBreak->getScriptType(sSrchStr, 0) ==
140 10 : ScriptType::COMPLEX));
141 10 : checkCTLEnd = (xBreak.is() && (xBreak->getScriptType(sSrchStr,
142 10 : sSrchStr.getLength()-1) == ScriptType::COMPLEX));
143 :
144 5 : switch( aSrchPara.algorithmType)
145 : {
146 : case SearchAlgorithms_REGEXP:
147 2 : fnForward = &TextSearch::RESrchFrwrd;
148 2 : fnBackward = &TextSearch::RESrchBkwrd;
149 2 : RESrchPrepare( aSrchPara);
150 2 : break;
151 :
152 : case SearchAlgorithms_APPROXIMATE:
153 0 : fnForward = &TextSearch::ApproxSrchFrwrd;
154 0 : fnBackward = &TextSearch::ApproxSrchBkwrd;
155 :
156 0 : pWLD = new WLevDistance( sSrchStr.getStr(), aSrchPara.changedChars,
157 : aSrchPara.insertedChars, aSrchPara.deletedChars,
158 0 : 0 != (SearchFlags::LEV_RELAXED & aSrchPara.searchFlag ) );
159 :
160 0 : nLimit = pWLD->GetLimit();
161 0 : break;
162 :
163 : default:
164 3 : fnForward = &TextSearch::NSrchFrwrd;
165 3 : fnBackward = &TextSearch::NSrchBkwrd;
166 3 : break;
167 : }
168 5 : }
169 :
170 0 : sal_Int32 FindPosInSeq_Impl( const Sequence <sal_Int32>& rOff, sal_Int32 nPos )
171 : {
172 0 : sal_Int32 nRet = 0, nEnd = rOff.getLength();
173 0 : while( nRet < nEnd && nPos > rOff[ nRet ] ) ++nRet;
174 0 : return nRet;
175 : }
176 :
177 0 : sal_Bool TextSearch::isCellStart(const OUString& searchStr, sal_Int32 nPos)
178 : throw( RuntimeException )
179 : {
180 : sal_Int32 nDone;
181 0 : return nPos == xBreak->previousCharacters(searchStr, nPos+1,
182 0 : aSrchPara.Locale, CharacterIteratorMode::SKIPCELL, 1, nDone);
183 : }
184 :
185 9 : SearchResult TextSearch::searchForward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
186 : throw( RuntimeException )
187 : {
188 9 : SearchResult sres;
189 :
190 9 : OUString in_str(searchStr);
191 9 : sal_Int32 newStartPos = startPos;
192 9 : sal_Int32 newEndPos = endPos;
193 :
194 9 : bUsePrimarySrchStr = true;
195 :
196 9 : if ( xTranslit.is() )
197 : {
198 : // apply normal transliteration (1<->1, 1<->0)
199 8 : com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength());
200 8 : in_str = xTranslit->transliterate( searchStr, 0, in_str.getLength(), offset );
201 :
202 : // JP 20.6.2001: also the start and end positions must be corrected!
203 8 : if( startPos )
204 0 : newStartPos = FindPosInSeq_Impl( offset, startPos );
205 :
206 8 : if( endPos < searchStr.getLength() )
207 0 : newEndPos = FindPosInSeq_Impl( offset, endPos );
208 : else
209 8 : newEndPos = in_str.getLength();
210 :
211 8 : sres = (this->*fnForward)( in_str, newStartPos, newEndPos );
212 :
213 9 : for ( int k = 0; k < sres.startOffset.getLength(); k++ )
214 : {
215 1 : if (sres.startOffset[k])
216 0 : sres.startOffset[k] = offset[sres.startOffset[k]];
217 : // JP 20.6.2001: end is ever exclusive and then don't return
218 : // the position of the next character - return the
219 : // next position behind the last found character!
220 : // "a b c" find "b" must return 2,3 and not 2,4!!!
221 1 : if (sres.endOffset[k])
222 1 : sres.endOffset[k] = offset[sres.endOffset[k]-1] + 1;
223 8 : }
224 : }
225 : else
226 : {
227 1 : sres = (this->*fnForward)( in_str, startPos, endPos );
228 : }
229 :
230 9 : if ( xTranslit2.is() && aSrchPara.algorithmType != SearchAlgorithms_REGEXP)
231 : {
232 0 : SearchResult sres2;
233 :
234 0 : in_str = OUString(searchStr);
235 0 : com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength());
236 :
237 0 : in_str = xTranslit2->transliterate( searchStr, 0, in_str.getLength(), offset );
238 :
239 0 : if( startPos )
240 0 : startPos = FindPosInSeq_Impl( offset, startPos );
241 :
242 0 : if( endPos < searchStr.getLength() )
243 0 : endPos = FindPosInSeq_Impl( offset, endPos );
244 : else
245 0 : endPos = in_str.getLength();
246 :
247 0 : bUsePrimarySrchStr = false;
248 0 : sres2 = (this->*fnForward)( in_str, startPos, endPos );
249 :
250 0 : for ( int k = 0; k < sres2.startOffset.getLength(); k++ )
251 : {
252 0 : if (sres2.startOffset[k])
253 0 : sres2.startOffset[k] = offset[sres2.startOffset[k]-1] + 1;
254 0 : if (sres2.endOffset[k])
255 0 : sres2.endOffset[k] = offset[sres2.endOffset[k]-1] + 1;
256 : }
257 :
258 : // pick first and long one
259 0 : if ( sres.subRegExpressions == 0)
260 0 : return sres2;
261 0 : if ( sres2.subRegExpressions == 1)
262 : {
263 0 : if ( sres.startOffset[0] > sres2.startOffset[0])
264 0 : return sres2;
265 0 : else if ( sres.startOffset[0] == sres2.startOffset[0] &&
266 0 : sres.endOffset[0] < sres2.endOffset[0])
267 0 : return sres2;
268 0 : }
269 : }
270 :
271 9 : return sres;
272 : }
273 :
274 1 : SearchResult TextSearch::searchBackward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
275 : throw(RuntimeException)
276 : {
277 1 : SearchResult sres;
278 :
279 1 : OUString in_str(searchStr);
280 1 : sal_Int32 newStartPos = startPos;
281 1 : sal_Int32 newEndPos = endPos;
282 :
283 1 : bUsePrimarySrchStr = true;
284 :
285 1 : if ( xTranslit.is() )
286 : {
287 : // apply only simple 1<->1 transliteration here
288 0 : com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength());
289 0 : in_str = xTranslit->transliterate( searchStr, 0, in_str.getLength(), offset );
290 :
291 : // JP 20.6.2001: also the start and end positions must be corrected!
292 0 : if( startPos < searchStr.getLength() )
293 0 : newStartPos = FindPosInSeq_Impl( offset, startPos );
294 : else
295 0 : newStartPos = in_str.getLength();
296 :
297 0 : if( endPos )
298 0 : newEndPos = FindPosInSeq_Impl( offset, endPos );
299 :
300 0 : sres = (this->*fnBackward)( in_str, newStartPos, newEndPos );
301 :
302 0 : for ( int k = 0; k < sres.startOffset.getLength(); k++ )
303 : {
304 0 : if (sres.startOffset[k])
305 0 : sres.startOffset[k] = offset[sres.startOffset[k] - 1] + 1;
306 : // JP 20.6.2001: end is ever exclusive and then don't return
307 : // the position of the next character - return the
308 : // next position behind the last found character!
309 : // "a b c" find "b" must return 2,3 and not 2,4!!!
310 0 : if (sres.endOffset[k])
311 0 : sres.endOffset[k] = offset[sres.endOffset[k]];
312 0 : }
313 : }
314 : else
315 : {
316 1 : sres = (this->*fnBackward)( in_str, startPos, endPos );
317 : }
318 :
319 1 : if ( xTranslit2.is() && aSrchPara.algorithmType != SearchAlgorithms_REGEXP )
320 : {
321 0 : SearchResult sres2;
322 :
323 0 : in_str = OUString(searchStr);
324 0 : com::sun::star::uno::Sequence <sal_Int32> offset( in_str.getLength());
325 :
326 0 : in_str = xTranslit2->transliterate(searchStr, 0, in_str.getLength(), offset);
327 :
328 0 : if( startPos < searchStr.getLength() )
329 0 : startPos = FindPosInSeq_Impl( offset, startPos );
330 : else
331 0 : startPos = in_str.getLength();
332 :
333 0 : if( endPos )
334 0 : endPos = FindPosInSeq_Impl( offset, endPos );
335 :
336 0 : bUsePrimarySrchStr = false;
337 0 : sres2 = (this->*fnBackward)( in_str, startPos, endPos );
338 :
339 0 : for( int k = 0; k < sres2.startOffset.getLength(); k++ )
340 : {
341 0 : if (sres2.startOffset[k])
342 0 : sres2.startOffset[k] = offset[sres2.startOffset[k]-1]+1;
343 0 : if (sres2.endOffset[k])
344 0 : sres2.endOffset[k] = offset[sres2.endOffset[k]-1]+1;
345 : }
346 :
347 : // pick last and long one
348 0 : if ( sres.subRegExpressions == 0 )
349 0 : return sres2;
350 0 : if ( sres2.subRegExpressions == 1 )
351 : {
352 0 : if ( sres.startOffset[0] < sres2.startOffset[0] )
353 0 : return sres2;
354 0 : if ( sres.startOffset[0] == sres2.startOffset[0] &&
355 0 : sres.endOffset[0] > sres2.endOffset[0] )
356 0 : return sres2;
357 0 : }
358 : }
359 :
360 1 : return sres;
361 : }
362 :
363 : //---------------------------------------------------------------------
364 :
365 0 : bool TextSearch::IsDelimiter( const OUString& rStr, sal_Int32 nPos ) const
366 : {
367 0 : bool bRet = 1;
368 0 : if( '\x7f' != rStr[nPos])
369 : {
370 0 : if ( !xCharClass.is() )
371 0 : xCharClass = CharacterClassification::create( m_xContext );
372 0 : sal_Int32 nCType = xCharClass->getCharacterType( rStr, nPos,
373 0 : aSrchPara.Locale );
374 0 : if( 0 != (( KCharacterType::DIGIT | KCharacterType::ALPHA |
375 : KCharacterType::LETTER ) & nCType ) )
376 0 : bRet = 0;
377 : }
378 0 : return bRet;
379 : }
380 :
381 : // --------- helper methods for Boyer-Moore like text searching ----------
382 : // TODO: use ICU's regex UREGEX_LITERAL mode instead when it becomes available
383 :
384 0 : void TextSearch::MakeForwardTab()
385 : {
386 : // create the jumptable for the search text
387 0 : if( pJumpTable )
388 : {
389 0 : if( bIsForwardTab )
390 0 : return ; // the jumpTable is ok
391 0 : delete pJumpTable;
392 : }
393 0 : bIsForwardTab = true;
394 :
395 0 : sal_Int32 n, nLen = sSrchStr.getLength();
396 0 : pJumpTable = new TextSearchJumpTable;
397 :
398 0 : for( n = 0; n < nLen - 1; ++n )
399 : {
400 0 : sal_Unicode cCh = sSrchStr[n];
401 0 : sal_Int32 nDiff = nLen - n - 1;
402 0 : TextSearchJumpTable::value_type aEntry( cCh, nDiff );
403 :
404 : ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
405 0 : pJumpTable->insert( aEntry );
406 0 : if ( !aPair.second )
407 0 : (*(aPair.first)).second = nDiff;
408 : }
409 : }
410 :
411 0 : void TextSearch::MakeForwardTab2()
412 : {
413 : // create the jumptable for the search text
414 0 : if( pJumpTable2 )
415 : {
416 0 : if( bIsForwardTab )
417 0 : return ; // the jumpTable is ok
418 0 : delete pJumpTable2;
419 : }
420 0 : bIsForwardTab = true;
421 :
422 0 : sal_Int32 n, nLen = sSrchStr2.getLength();
423 0 : pJumpTable2 = new TextSearchJumpTable;
424 :
425 0 : for( n = 0; n < nLen - 1; ++n )
426 : {
427 0 : sal_Unicode cCh = sSrchStr2[n];
428 0 : sal_Int32 nDiff = nLen - n - 1;
429 :
430 0 : TextSearchJumpTable::value_type aEntry( cCh, nDiff );
431 : ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
432 0 : pJumpTable2->insert( aEntry );
433 0 : if ( !aPair.second )
434 0 : (*(aPair.first)).second = nDiff;
435 : }
436 : }
437 :
438 0 : void TextSearch::MakeBackwardTab()
439 : {
440 : // create the jumptable for the search text
441 0 : if( pJumpTable )
442 : {
443 0 : if( !bIsForwardTab )
444 0 : return ; // the jumpTable is ok
445 0 : delete pJumpTable;
446 : }
447 0 : bIsForwardTab = false;
448 :
449 0 : sal_Int32 n, nLen = sSrchStr.getLength();
450 0 : pJumpTable = new TextSearchJumpTable;
451 :
452 0 : for( n = nLen-1; n > 0; --n )
453 : {
454 0 : sal_Unicode cCh = sSrchStr[n];
455 0 : TextSearchJumpTable::value_type aEntry( cCh, n );
456 : ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
457 0 : pJumpTable->insert( aEntry );
458 0 : if ( !aPair.second )
459 0 : (*(aPair.first)).second = n;
460 : }
461 : }
462 :
463 0 : void TextSearch::MakeBackwardTab2()
464 : {
465 : // create the jumptable for the search text
466 0 : if( pJumpTable2 )
467 : {
468 0 : if( !bIsForwardTab )
469 0 : return ; // the jumpTable is ok
470 0 : delete pJumpTable2;
471 : }
472 0 : bIsForwardTab = false;
473 :
474 0 : sal_Int32 n, nLen = sSrchStr2.getLength();
475 0 : pJumpTable2 = new TextSearchJumpTable;
476 :
477 0 : for( n = nLen-1; n > 0; --n )
478 : {
479 0 : sal_Unicode cCh = sSrchStr2[n];
480 0 : TextSearchJumpTable::value_type aEntry( cCh, n );
481 : ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
482 0 : pJumpTable2->insert( aEntry );
483 0 : if ( !aPair.second )
484 0 : (*(aPair.first)).second = n;
485 : }
486 : }
487 :
488 0 : sal_Int32 TextSearch::GetDiff( const sal_Unicode cChr ) const
489 : {
490 : TextSearchJumpTable *pJump;
491 0 : OUString sSearchKey;
492 :
493 0 : if ( bUsePrimarySrchStr ) {
494 0 : pJump = pJumpTable;
495 0 : sSearchKey = sSrchStr;
496 : } else {
497 0 : pJump = pJumpTable2;
498 0 : sSearchKey = sSrchStr2;
499 : }
500 :
501 0 : TextSearchJumpTable::const_iterator iLook = pJump->find( cChr );
502 0 : if ( iLook == pJump->end() )
503 0 : return sSearchKey.getLength();
504 0 : return (*iLook).second;
505 : }
506 :
507 :
508 : // TextSearch::NSrchFrwrd is mis-optimized on unxsoli (#i105945#)
509 0 : SearchResult TextSearch::NSrchFrwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
510 : throw(RuntimeException)
511 : {
512 0 : SearchResult aRet;
513 0 : aRet.subRegExpressions = 0;
514 :
515 0 : OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;
516 :
517 0 : OUString aStr( searchStr );
518 0 : sal_Int32 nSuchIdx = aStr.getLength();
519 0 : sal_Int32 nEnde = endPos;
520 0 : if( !nSuchIdx || !sSearchKey.getLength() || sSearchKey.getLength() > nSuchIdx )
521 : return aRet;
522 :
523 :
524 0 : if( nEnde < sSearchKey.getLength() ) // position inside the search region ?
525 : return aRet;
526 :
527 0 : nEnde -= sSearchKey.getLength();
528 :
529 0 : if (bUsePrimarySrchStr)
530 0 : MakeForwardTab(); // create the jumptable
531 : else
532 0 : MakeForwardTab2();
533 :
534 0 : for (sal_Int32 nCmpIdx = startPos; // start position for the search
535 : nCmpIdx <= nEnde;
536 0 : nCmpIdx += GetDiff( aStr[nCmpIdx + sSearchKey.getLength()-1]))
537 : {
538 : // if the match would be the completed cells, skip it.
539 0 : if ( (checkCTLStart && !isCellStart( aStr, nCmpIdx )) || (checkCTLEnd
540 0 : && !isCellStart( aStr, nCmpIdx + sSearchKey.getLength())) )
541 0 : continue;
542 :
543 0 : nSuchIdx = sSearchKey.getLength() - 1;
544 0 : while( nSuchIdx >= 0 && sSearchKey[nSuchIdx] == aStr[nCmpIdx + nSuchIdx])
545 : {
546 0 : if( nSuchIdx == 0 )
547 : {
548 0 : if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
549 : {
550 0 : sal_Int32 nFndEnd = nCmpIdx + sSearchKey.getLength();
551 0 : bool bAtStart = !nCmpIdx;
552 0 : bool bAtEnd = nFndEnd == endPos;
553 0 : bool bDelimBefore = bAtStart || IsDelimiter( aStr, nCmpIdx-1 );
554 0 : bool bDelimBehind = IsDelimiter( aStr, nFndEnd );
555 : // * 1 -> only one word in the paragraph
556 : // * 2 -> at begin of paragraph
557 : // * 3 -> at end of paragraph
558 : // * 4 -> inside the paragraph
559 0 : if( !( ( bAtStart && bAtEnd ) || // 1
560 : ( bAtStart && bDelimBehind ) || // 2
561 : ( bAtEnd && bDelimBefore ) || // 3
562 0 : ( bDelimBefore && bDelimBehind ))) // 4
563 0 : break;
564 : }
565 :
566 0 : aRet.subRegExpressions = 1;
567 0 : aRet.startOffset.realloc( 1 );
568 0 : aRet.startOffset[ 0 ] = nCmpIdx;
569 0 : aRet.endOffset.realloc( 1 );
570 0 : aRet.endOffset[ 0 ] = nCmpIdx + sSearchKey.getLength();
571 :
572 : return aRet;
573 : }
574 : else
575 0 : nSuchIdx--;
576 : }
577 : }
578 0 : return aRet;
579 : }
580 :
581 0 : SearchResult TextSearch::NSrchBkwrd( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
582 : throw(RuntimeException)
583 : {
584 0 : SearchResult aRet;
585 0 : aRet.subRegExpressions = 0;
586 :
587 0 : OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;
588 :
589 0 : OUString aStr( searchStr );
590 0 : sal_Int32 nSuchIdx = aStr.getLength();
591 0 : sal_Int32 nEnde = endPos;
592 0 : if( nSuchIdx == 0 || sSearchKey.isEmpty() || sSearchKey.getLength() > nSuchIdx)
593 : return aRet;
594 :
595 0 : if (bUsePrimarySrchStr)
596 0 : MakeBackwardTab(); // create the jumptable
597 : else
598 0 : MakeBackwardTab2();
599 :
600 0 : if( nEnde == nSuchIdx ) // end position for the search
601 0 : nEnde = sSearchKey.getLength();
602 : else
603 0 : nEnde += sSearchKey.getLength();
604 :
605 0 : sal_Int32 nCmpIdx = startPos; // start position for the search
606 :
607 0 : while (nCmpIdx >= nEnde)
608 : {
609 : // if the match would be the completed cells, skip it.
610 0 : if ( (!checkCTLStart || isCellStart( aStr, nCmpIdx -
611 0 : sSearchKey.getLength() )) && (!checkCTLEnd ||
612 0 : isCellStart( aStr, nCmpIdx)))
613 : {
614 0 : nSuchIdx = 0;
615 0 : while( nSuchIdx < sSearchKey.getLength() && sSearchKey[nSuchIdx] ==
616 0 : aStr[nCmpIdx + nSuchIdx - sSearchKey.getLength()] )
617 0 : nSuchIdx++;
618 0 : if( nSuchIdx >= sSearchKey.getLength() )
619 : {
620 0 : if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
621 : {
622 0 : sal_Int32 nFndStt = nCmpIdx - sSearchKey.getLength();
623 0 : bool bAtStart = !nFndStt;
624 0 : bool bAtEnd = nCmpIdx == startPos;
625 0 : bool bDelimBehind = IsDelimiter( aStr, nCmpIdx );
626 : bool bDelimBefore = bAtStart || // begin of paragraph
627 0 : IsDelimiter( aStr, nFndStt-1 );
628 : // * 1 -> only one word in the paragraph
629 : // * 2 -> at begin of paragraph
630 : // * 3 -> at end of paragraph
631 : // * 4 -> inside the paragraph
632 0 : if( ( bAtStart && bAtEnd ) || // 1
633 : ( bAtStart && bDelimBehind ) || // 2
634 : ( bAtEnd && bDelimBefore ) || // 3
635 : ( bDelimBefore && bDelimBehind )) // 4
636 : {
637 0 : aRet.subRegExpressions = 1;
638 0 : aRet.startOffset.realloc( 1 );
639 0 : aRet.startOffset[ 0 ] = nCmpIdx;
640 0 : aRet.endOffset.realloc( 1 );
641 0 : aRet.endOffset[ 0 ] = nCmpIdx - sSearchKey.getLength();
642 : return aRet;
643 : }
644 : }
645 : else
646 : {
647 0 : aRet.subRegExpressions = 1;
648 0 : aRet.startOffset.realloc( 1 );
649 0 : aRet.startOffset[ 0 ] = nCmpIdx;
650 0 : aRet.endOffset.realloc( 1 );
651 0 : aRet.endOffset[ 0 ] = nCmpIdx - sSearchKey.getLength();
652 : return aRet;
653 : }
654 : }
655 : }
656 0 : nSuchIdx = GetDiff( aStr[nCmpIdx - sSearchKey.getLength()] );
657 0 : if( nCmpIdx < nSuchIdx )
658 : return aRet;
659 0 : nCmpIdx -= nSuchIdx;
660 : }
661 0 : return aRet;
662 : }
663 :
664 2 : void TextSearch::RESrchPrepare( const ::com::sun::star::util::SearchOptions& rOptions)
665 : {
666 : // select the transliterated pattern string
667 : const OUString& rPatternStr =
668 : (rOptions.transliterateFlags & SIMPLE_TRANS_MASK) ? sSrchStr
669 2 : : ((rOptions.transliterateFlags & COMPLEX_TRANS_MASK) ? sSrchStr2 : rOptions.searchString);
670 :
671 2 : sal_uInt32 nIcuSearchFlags = UREGEX_UWORD; // request UAX#29 unicode capability
672 : // map com::sun::star::util::SearchFlags to ICU uregex.h flags
673 : // TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE
674 : // REG_NEWLINE is neither properly defined nor used anywhere => not implemented
675 : // REG_NOSUB is not used anywhere => not implemented
676 : // NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute
677 : // LEV_RELAXED is only used for SearchAlgorithm==Approximate
678 : // why is even ALL_IGNORE_CASE deprecated in UNO? because of transliteration taking care of it???
679 2 : if( (rOptions.searchFlag & com::sun::star::util::SearchFlags::ALL_IGNORE_CASE) != 0)
680 2 : nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE;
681 2 : UErrorCode nIcuErr = U_ZERO_ERROR;
682 : // assumption: transliteration didn't mangle regexp control chars
683 2 : IcuUniString aIcuSearchPatStr( (const UChar*)rPatternStr.getStr(), rPatternStr.getLength());
684 : #ifndef DISABLE_WORDBOUND_EMULATION
685 : // for conveniance specific syntax elements of the old regex engine are emulated
686 : // by using regular word boundary matching \b to replace \< and \>
687 2 : static const IcuUniString aChevronPattern( "\\\\<|\\\\>", -1, IcuUniString::kInvariant);
688 2 : static const IcuUniString aChevronReplace( "\\\\b", -1, IcuUniString::kInvariant);
689 2 : static RegexMatcher aChevronMatcher( aChevronPattern, 0, nIcuErr);
690 2 : aChevronMatcher.reset( aIcuSearchPatStr);
691 2 : aIcuSearchPatStr = aChevronMatcher.replaceAll( aChevronReplace, nIcuErr);
692 2 : aChevronMatcher.reset();
693 : #endif
694 2 : pRegexMatcher = new RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr);
695 2 : if( nIcuErr)
696 0 : { delete pRegexMatcher; pRegexMatcher = NULL;}
697 2 : }
698 :
699 : //---------------------------------------------------------------------------
700 :
701 9 : SearchResult TextSearch::RESrchFrwrd( const OUString& searchStr,
702 : sal_Int32 startPos, sal_Int32 endPos )
703 : throw(RuntimeException)
704 : {
705 9 : SearchResult aRet;
706 9 : aRet.subRegExpressions = 0;
707 9 : if( !pRegexMatcher)
708 : return aRet;
709 :
710 9 : if( endPos > searchStr.getLength())
711 0 : endPos = searchStr.getLength();
712 :
713 : // use the ICU RegexMatcher to find the matches
714 9 : UErrorCode nIcuErr = U_ZERO_ERROR;
715 9 : const IcuUniString aSearchTargetStr( (const UChar*)searchStr.getStr(), endPos);
716 9 : pRegexMatcher->reset( aSearchTargetStr);
717 : // search until there is a valid match
718 0 : for(;;)
719 : {
720 9 : if( !pRegexMatcher->find( startPos, nIcuErr))
721 : return aRet;
722 :
723 : // #i118887# ignore zero-length matches e.g. "a*" in "bc"
724 2 : int nStartOfs = pRegexMatcher->start( nIcuErr);
725 2 : int nEndOfs = pRegexMatcher->end( nIcuErr);
726 2 : if( nStartOfs < nEndOfs)
727 2 : break;
728 : // try at next position if there was a zero-length match
729 0 : if( ++startPos >= endPos)
730 : return aRet;
731 : }
732 :
733 : // extract the result of the search
734 2 : const int nGroupCount = pRegexMatcher->groupCount();
735 2 : aRet.subRegExpressions = nGroupCount + 1;
736 2 : aRet.startOffset.realloc( aRet.subRegExpressions);
737 2 : aRet.endOffset.realloc( aRet.subRegExpressions);
738 2 : aRet.startOffset[0] = pRegexMatcher->start( nIcuErr);
739 2 : aRet.endOffset[0] = pRegexMatcher->end( nIcuErr);
740 4 : for( int i = 1; i <= nGroupCount; ++i) {
741 2 : aRet.startOffset[i] = pRegexMatcher->start( i, nIcuErr);
742 2 : aRet.endOffset[i] = pRegexMatcher->end( i, nIcuErr);
743 : }
744 :
745 9 : return aRet;
746 : }
747 :
748 1 : SearchResult TextSearch::RESrchBkwrd( const OUString& searchStr,
749 : sal_Int32 startPos, sal_Int32 endPos )
750 : throw(RuntimeException)
751 : {
752 : // NOTE: for backwards search callers provide startPos/endPos inverted!
753 1 : SearchResult aRet;
754 1 : aRet.subRegExpressions = 0;
755 1 : if( !pRegexMatcher)
756 : return aRet;
757 :
758 1 : if( startPos > searchStr.getLength())
759 0 : startPos = searchStr.getLength();
760 :
761 : // use the ICU RegexMatcher to find the matches
762 : // TODO: use ICU's backward searching once it becomes available
763 : // as its replacement using forward search is not as good as the real thing
764 1 : UErrorCode nIcuErr = U_ZERO_ERROR;
765 1 : const IcuUniString aSearchTargetStr( (const UChar*)searchStr.getStr(), startPos);
766 1 : pRegexMatcher->reset( aSearchTargetStr);
767 1 : if( !pRegexMatcher->find( endPos, nIcuErr))
768 : return aRet;
769 :
770 : // find the last match
771 1 : int nLastPos = 0;
772 3 : do {
773 3 : nLastPos = pRegexMatcher->start( nIcuErr);
774 3 : } while( pRegexMatcher->find( nLastPos + 1, nIcuErr));
775 :
776 : // find last match again to get its details
777 1 : pRegexMatcher->find( nLastPos, nIcuErr);
778 :
779 : // fill in the details of the last match
780 1 : const int nGroupCount = pRegexMatcher->groupCount();
781 1 : aRet.subRegExpressions = nGroupCount + 1;
782 1 : aRet.startOffset.realloc( aRet.subRegExpressions);
783 1 : aRet.endOffset.realloc( aRet.subRegExpressions);
784 : // NOTE: existing users of backward search seem to expect startOfs/endOfs being inverted!
785 1 : aRet.startOffset[0] = pRegexMatcher->end( nIcuErr);
786 1 : aRet.endOffset[0] = pRegexMatcher->start( nIcuErr);
787 3 : for( int i = 1; i <= nGroupCount; ++i) {
788 2 : aRet.startOffset[i] = pRegexMatcher->end( i, nIcuErr);
789 2 : aRet.endOffset[i] = pRegexMatcher->start( i, nIcuErr);
790 : }
791 :
792 1 : return aRet;
793 : }
794 :
795 : //---------------------------------------------------------------------------
796 :
797 : // search for words phonetically
798 0 : SearchResult TextSearch::ApproxSrchFrwrd( const OUString& searchStr,
799 : sal_Int32 startPos, sal_Int32 endPos )
800 : throw(RuntimeException)
801 : {
802 0 : SearchResult aRet;
803 0 : aRet.subRegExpressions = 0;
804 :
805 0 : if( !xBreak.is() )
806 : return aRet;
807 :
808 0 : OUString aWTemp( searchStr );
809 :
810 : register sal_Int32 nStt, nEnd;
811 :
812 0 : Boundary aWBnd = xBreak->getWordBoundary( aWTemp, startPos,
813 : aSrchPara.Locale,
814 0 : WordType::ANYWORD_IGNOREWHITESPACES, sal_True );
815 :
816 0 : do
817 : {
818 0 : if( aWBnd.startPos >= endPos )
819 0 : break;
820 0 : nStt = aWBnd.startPos < startPos ? startPos : aWBnd.startPos;
821 0 : nEnd = aWBnd.endPos > endPos ? endPos : aWBnd.endPos;
822 :
823 0 : if( nStt < nEnd &&
824 0 : pWLD->WLD( aWTemp.getStr() + nStt, nEnd - nStt ) <= nLimit )
825 : {
826 0 : aRet.subRegExpressions = 1;
827 0 : aRet.startOffset.realloc( 1 );
828 0 : aRet.startOffset[ 0 ] = nStt;
829 0 : aRet.endOffset.realloc( 1 );
830 0 : aRet.endOffset[ 0 ] = nEnd;
831 0 : break;
832 : }
833 :
834 0 : nStt = nEnd - 1;
835 0 : aWBnd = xBreak->nextWord( aWTemp, nStt, aSrchPara.Locale,
836 0 : WordType::ANYWORD_IGNOREWHITESPACES);
837 : } while( aWBnd.startPos != aWBnd.endPos ||
838 0 : (aWBnd.endPos != aWTemp.getLength() && aWBnd.endPos != nEnd) );
839 : // #i50244# aWBnd.endPos != nEnd : in case there is _no_ word (only
840 : // whitespace) in searchStr, getWordBoundary() returned startPos,startPos
841 : // and nextWord() does also => don't loop forever.
842 0 : return aRet;
843 : }
844 :
845 0 : SearchResult TextSearch::ApproxSrchBkwrd( const OUString& searchStr,
846 : sal_Int32 startPos, sal_Int32 endPos )
847 : throw(RuntimeException)
848 : {
849 0 : SearchResult aRet;
850 0 : aRet.subRegExpressions = 0;
851 :
852 0 : if( !xBreak.is() )
853 : return aRet;
854 :
855 0 : OUString aWTemp( searchStr );
856 :
857 : register sal_Int32 nStt, nEnd;
858 :
859 0 : Boundary aWBnd = xBreak->getWordBoundary( aWTemp, startPos,
860 : aSrchPara.Locale,
861 0 : WordType::ANYWORD_IGNOREWHITESPACES, sal_True );
862 :
863 0 : do
864 : {
865 0 : if( aWBnd.endPos <= endPos )
866 0 : break;
867 0 : nStt = aWBnd.startPos < endPos ? endPos : aWBnd.startPos;
868 0 : nEnd = aWBnd.endPos > startPos ? startPos : aWBnd.endPos;
869 :
870 0 : if( nStt < nEnd &&
871 0 : pWLD->WLD( aWTemp.getStr() + nStt, nEnd - nStt ) <= nLimit )
872 : {
873 0 : aRet.subRegExpressions = 1;
874 0 : aRet.startOffset.realloc( 1 );
875 0 : aRet.startOffset[ 0 ] = nEnd;
876 0 : aRet.endOffset.realloc( 1 );
877 0 : aRet.endOffset[ 0 ] = nStt;
878 0 : break;
879 : }
880 0 : if( !nStt )
881 0 : break;
882 :
883 0 : aWBnd = xBreak->previousWord( aWTemp, nStt, aSrchPara.Locale,
884 0 : WordType::ANYWORD_IGNOREWHITESPACES);
885 0 : } while( aWBnd.startPos != aWBnd.endPos || aWBnd.endPos != aWTemp.getLength() );
886 0 : return aRet;
887 : }
888 :
889 :
890 : static const sal_Char cSearchName[] = "com.sun.star.util.TextSearch";
891 : static const sal_Char cSearchImpl[] = "com.sun.star.util.TextSearch_i18n";
892 :
893 2 : static OUString getServiceName_Static()
894 : {
895 2 : return OUString::createFromAscii( cSearchName );
896 : }
897 :
898 2 : static OUString getImplementationName_Static()
899 : {
900 2 : return OUString::createFromAscii( cSearchImpl );
901 : }
902 :
903 : OUString SAL_CALL
904 0 : TextSearch::getImplementationName()
905 : throw( RuntimeException )
906 : {
907 0 : return getImplementationName_Static();
908 : }
909 :
910 : sal_Bool SAL_CALL
911 0 : TextSearch::supportsService(const OUString& rServiceName)
912 : throw( RuntimeException )
913 : {
914 0 : return rServiceName == cSearchName;
915 : }
916 :
917 : Sequence< OUString > SAL_CALL
918 0 : TextSearch::getSupportedServiceNames(void) throw( RuntimeException )
919 : {
920 0 : Sequence< OUString > aRet(1);
921 0 : aRet[0] = getServiceName_Static();
922 0 : return aRet;
923 : }
924 :
925 : ::com::sun::star::uno::Reference< ::com::sun::star::uno::XInterface >
926 3 : SAL_CALL TextSearch_CreateInstance(
927 : const ::com::sun::star::uno::Reference<
928 : ::com::sun::star::lang::XMultiServiceFactory >& rxMSF )
929 : {
930 : return ::com::sun::star::uno::Reference<
931 : ::com::sun::star::uno::XInterface >(
932 : (::cppu::OWeakObject*) new TextSearch(
933 3 : comphelper::getComponentContext( rxMSF ) ) );
934 : }
935 :
936 : extern "C"
937 : {
938 : SAL_DLLPUBLIC_EXPORT void* SAL_CALL
939 2 : i18nsearch_component_getFactory( const sal_Char* sImplementationName,
940 : void* _pServiceManager,
941 : SAL_UNUSED_PARAMETER void* )
942 : {
943 2 : void* pRet = NULL;
944 :
945 : ::com::sun::star::lang::XMultiServiceFactory* pServiceManager =
946 : reinterpret_cast< ::com::sun::star::lang::XMultiServiceFactory* >
947 2 : ( _pServiceManager );
948 : ::com::sun::star::uno::Reference<
949 2 : ::com::sun::star::lang::XSingleServiceFactory > xFactory;
950 :
951 2 : if ( 0 == rtl_str_compare( sImplementationName, cSearchImpl) )
952 : {
953 2 : ::com::sun::star::uno::Sequence< OUString > aServiceNames(1);
954 2 : aServiceNames[0] = getServiceName_Static();
955 : xFactory = ::cppu::createSingleFactory(
956 : pServiceManager, getImplementationName_Static(),
957 2 : &TextSearch_CreateInstance, aServiceNames );
958 : }
959 :
960 2 : if ( xFactory.is() )
961 : {
962 2 : xFactory->acquire();
963 2 : pRet = xFactory.get();
964 : }
965 :
966 2 : return pRet;
967 : }
968 :
969 6 : } // extern "C"
970 :
971 : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|