Branch data Line data Source code
1 : : /*************************************************************************
2 : : *
3 : : * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 : : *
5 : : * Copyright 2000, 2010 Oracle and/or its affiliates.
6 : : *
7 : : * OpenOffice.org - a multi-platform office productivity suite
8 : : *
9 : : * This file is part of OpenOffice.org.
10 : : *
11 : : * OpenOffice.org is free software: you can redistribute it and/or modify
12 : : * it under the terms of the GNU Lesser General Public License version 3
13 : : * only, as published by the Free Software Foundation.
14 : : *
15 : : * OpenOffice.org is distributed in the hope that it will be useful,
16 : : * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 : : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 : : * GNU Lesser General Public License version 3 for more details
19 : : * (a copy is included in the LICENSE file that accompanied this code).
20 : : *
21 : : * You should have received a copy of the GNU Lesser General Public License
22 : : * version 3 along with OpenOffice.org. If not, see
23 : : * <http://www.openoffice.org/license.html>
24 : : * for a copy of the LGPLv3 License.
25 : : *
26 : : ************************************************************************/
27 : :
28 : : /* Extended regular expression matching and search library,
29 : : version 0.12.
30 : : (Implements POSIX draft P1003.2/D11.2, except for some of the
31 : : internationalization features.)
32 : : Copyright (C) 1993, 94, 95, 96, 97, 98, 99 Free Software Foundation, Inc.
33 : :
34 : : The GNU C Library is free software; you can redistribute it and/or
35 : : modify it under the terms of the GNU Library General Public License as
36 : : published by the Free Software Foundation; either version 2 of the
37 : : License, or (at your option) any later version.
38 : :
39 : : The GNU C Library is distributed in the hope that it will be useful,
40 : : but WITHOUT ANY WARRANTY; without even the implied warranty of
41 : : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
42 : : Library General Public License for more details.
43 : :
44 : : You should have received a copy of the GNU Library General Public
45 : : License along with the GNU C Library; see the file COPYING.LIB. If not,
46 : : write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
47 : : Boston, MA 02111-1307, USA. */
48 : :
49 : : /*
50 : : Modified for OpenOffice.org to use sal_Unicode and Transliteration service.
51 : : */
52 : :
53 : : #include <regexp/reclass.hxx>
54 : :
55 : : #if 0
56 : : /* If for any reason (porting, debug) we can't use alloca() use malloc()
57 : : instead. Use alloca() if possible for performance reasons, this _is_
58 : : significant, with malloc() the re_match2() method makes heavy use of regexps
59 : : through the TextSearch interface up to three times slower. This is _the_
60 : : bottleneck in some spreadsheet documents. */
61 : : #define REGEX_MALLOC
62 : : #endif
63 : :
64 : : /* AIX requires this to be the first thing in the file. */
65 : : #if defined _AIX && !defined REGEX_MALLOC
66 : : #pragma alloca
67 : : #endif
68 : :
69 : : #include <string.h>
70 : : #include <assert.h>
71 : :
72 : : #include <rtl/ustring.hxx>
73 : : #include <com/sun/star/i18n/TransliterationModules.hpp>
74 : :
75 : : /* Maximum number of duplicates an interval can allow. Some systems
76 : : (erroneously) define this in other header files, but we want our
77 : : value, so remove any previous define. */
78 : : #ifdef RE_DUP_MAX
79 : : # undef RE_DUP_MAX
80 : : #endif
81 : : /* If sizeof(int) == 2, then ((1 << 15) - 1) overflows. */
82 : : #define RE_DUP_MAX (0x7fff)
83 : :
84 : :
85 : : /* If `regs_allocated' is REGS_UNALLOCATED in the pattern buffer,
86 : : `re_match_2' returns information about at least this many registers
87 : : the first time a `regs' structure is passed. */
88 : : #ifndef RE_NREGS
89 : : # define RE_NREGS 30
90 : : #endif
91 : :
92 : :
93 : : // Macros
94 : : #define INIT_COMPILE_STACK_SIZE 32
95 : : #define INIT_BUF_SIZE ((1 << BYTEWIDTH)/BYTEWIDTH)
96 : : #define MAX_BUF_SIZE 65535L
97 : : #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
98 : : #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
99 : :
100 : : /* Since we have one byte reserved for the register number argument to
101 : : {start,stop}_memory, the maximum number of groups we can report
102 : : things about is what fits in that byte. */
103 : : #define MAX_REGNUM 255
104 : :
105 : : #define MIN(x, y) ( (x) < (y) ? (x) : (y) )
106 : : #define MAX(x, y) ( (x) > (y) ? (x) : (y) )
107 : :
108 : :
109 : : // Always. We're not in Emacs and don't use relocating allocators.
110 : : #define MATCH_MAY_ALLOCATE
111 : :
112 : : /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
113 : : use `alloca' instead of `malloc'. This is because malloc is slower and
114 : : causes storage fragmentation. On the other hand, malloc is more portable,
115 : : and easier to debug.
116 : :
117 : : Because we sometimes use alloca, some routines have to be macros,
118 : : not functions -- `alloca'-allocated space disappears at the end of the
119 : : function it is called in. */
120 : :
121 : : #ifdef REGEX_MALLOC
122 : :
123 : : # define REGEX_ALLOCATE malloc
124 : : # define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
125 : : # define REGEX_FREE free
126 : :
127 : : #else /* not REGEX_MALLOC */
128 : :
129 : : /* Emacs already defines alloca, sometimes. So does MSDEV. */
130 : : # ifndef alloca
131 : :
132 : : /* Make alloca work the best possible way. */
133 : : # ifdef __GNUC__
134 : : # define alloca __builtin_alloca
135 : : # else /* not __GNUC__ */
136 : : # include <sal/alloca.h>
137 : : # endif /* not __GNUC__ */
138 : :
139 : : # endif /* not alloca */
140 : :
141 : : # define REGEX_ALLOCATE alloca
142 : :
143 : : /* Assumes a `char *destination' variable. */
144 : : # define REGEX_REALLOCATE(source, osize, nsize) \
145 : : (destination = (char *) alloca (nsize), \
146 : : memcpy (destination, source, osize))
147 : :
148 : : /* No need to do anything to free, after alloca. */
149 : : # define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
150 : :
151 : : #endif /* not REGEX_MALLOC */
152 : :
153 : :
154 : : /* Define how to allocate the failure stack. */
155 : :
156 : : #ifdef REGEX_MALLOC
157 : :
158 : : # define REGEX_ALLOCATE_STACK malloc
159 : : # define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
160 : : # define REGEX_FREE_STACK free
161 : :
162 : : #else /* not REGEX_MALLOC */
163 : :
164 : : # define REGEX_ALLOCATE_STACK alloca
165 : :
166 : : # define REGEX_REALLOCATE_STACK(source, osize, nsize) \
167 : : REGEX_REALLOCATE (source, osize, nsize)
168 : : /* No need to explicitly free anything. */
169 : : # define REGEX_FREE_STACK(arg)
170 : :
171 : : #endif /* not REGEX_MALLOC */
172 : :
173 : :
174 : : /* (Re)Allocate N items of type T using malloc, or fail. */
175 : : #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
176 : : #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
177 : : #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
178 : :
179 : : #define BYTEWIDTH 16 /* In bits (assuming sizeof(sal_Unicode)*8) */
180 : :
181 : :
182 : : #define CHAR_CLASS_MAX_LENGTH 256
183 : :
184 : : /* Fetch the next character in the uncompiled pattern, with no
185 : : translation. */
186 : : #define PATFETCH_RAW(c) \
187 : : do { \
188 : : if (p == pend) return REG_EEND; \
189 : : c = (sal_Unicode) *p++; \
190 : : } while (0)
191 : :
192 : : /* Go backwards one character in the pattern. */
193 : : #define PATUNFETCH p--
194 : :
195 : : #define FREE_STACK_RETURN(value) \
196 : : return(free(compile_stack.stack), value)
197 : :
198 : : #define GET_BUFFER_SPACE(n) \
199 : : while ((sal_uInt32)(b - bufp->buffer + (n)) > bufp->allocated) \
200 : : EXTEND_BUFFER()
201 : :
202 : : /* Extend the buffer by twice its current size via realloc and
203 : : reset the pointers that pointed into the old block to point to the
204 : : correct places in the new one. If extending the buffer results in it
205 : : being larger than MAX_BUF_SIZE, then flag memory exhausted. */
206 : : #define EXTEND_BUFFER() \
207 : : do { \
208 : : sal_Unicode *old_buffer = bufp->buffer; \
209 : : if (bufp->allocated == MAX_BUF_SIZE) \
210 : : return REG_ESIZE; \
211 : : bufp->allocated <<= 1; \
212 : : if (bufp->allocated > MAX_BUF_SIZE) \
213 : : bufp->allocated = MAX_BUF_SIZE; \
214 : : bufp->buffer = (sal_Unicode *) realloc(bufp->buffer, \
215 : : bufp->allocated * \
216 : : sizeof(sal_Unicode)); \
217 : : if (bufp->buffer == NULL) \
218 : : return REG_ESPACE; \
219 : : /* If the buffer moved, move all the pointers into it. */ \
220 : : if (old_buffer != bufp->buffer) { \
221 : : b = (b - old_buffer) + bufp->buffer; \
222 : : begalt = (begalt - old_buffer) + bufp->buffer; \
223 : : if (fixup_alt_jump) \
224 : : fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\
225 : : if (laststart) \
226 : : laststart = (laststart - old_buffer) + bufp->buffer; \
227 : : if (pending_exact) \
228 : : pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
229 : : } \
230 : : } while (0)
231 : :
232 : : #define BUF_PUSH(c) \
233 : : do { \
234 : : GET_BUFFER_SPACE(1); \
235 : : *b++ = (sal_Unicode)(c); \
236 : : } while(0)
237 : :
238 : : /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
239 : : #define BUF_PUSH_2(c1, c2) \
240 : : do { \
241 : : GET_BUFFER_SPACE(2); \
242 : : *b++ = (sal_Unicode) (c1); \
243 : : *b++ = (sal_Unicode) (c2); \
244 : : } while (0)
245 : :
246 : : /* As with BUF_PUSH_2, except for three bytes. */
247 : : #define BUF_PUSH_3(c1, c2, c3) \
248 : : do { \
249 : : GET_BUFFER_SPACE(3); \
250 : : *b++ = (sal_Unicode) (c1); \
251 : : *b++ = (sal_Unicode) (c2); \
252 : : *b++ = (sal_Unicode) (c3); \
253 : : } while (0)
254 : :
255 : : /* Store a jump with opcode OP at LOC to location TO. We store a
256 : : relative address offset by the three bytes the jump itself occupies. */
257 : : #define STORE_JUMP(op, loc, to) \
258 : : store_op1(op, loc, (int) ((to) - (loc) - 3))
259 : :
260 : : /* Likewise, for a two-argument jump. */
261 : : #define STORE_JUMP2(op, loc, to, arg) \
262 : : store_op2(op, loc, (int) ((to) - (loc) - 3), arg)
263 : :
264 : : /* Store NUMBER in two contiguous sal_Unicode starting at DESTINATION. */
265 : :
266 : : inline
267 : : void
268 : 0 : Regexpr::store_number( sal_Unicode * destination, sal_Int32 number )
269 : : {
270 : 0 : (destination)[0] = sal_Unicode((number) & 0xffff);
271 : 0 : (destination)[1] = sal_Unicode((number) >> 16);
272 : 0 : }
273 : :
274 : : /* Same as STORE_NUMBER, except increment DESTINATION to
275 : : the byte after where the number is stored. Therefore, DESTINATION
276 : : must be an lvalue. */
277 : :
278 : : inline
279 : : void
280 : 0 : Regexpr::store_number_and_incr( sal_Unicode *& destination, sal_Int32 number )
281 : : {
282 : 0 : store_number( destination, number );
283 : 0 : (destination) += 2;
284 : 0 : }
285 : :
286 : : /* Put into DESTINATION a number stored in two contiguous sal_Unicode starting
287 : : at SOURCE. */
288 : :
289 : 0 : inline void Regexpr::extract_number( sal_Int32 & dest, sal_Unicode *source )
290 : : {
291 : 0 : dest = (((sal_Int32) source[1]) << 16) | (source[0] & 0xffff);
292 : 0 : }
293 : :
294 : : /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
295 : : #define INSERT_JUMP(op, loc, to) \
296 : : insert_op1(op, loc, (sal_Int32) ((to) - (loc) - 3), b)
297 : :
298 : : /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
299 : : #define INSERT_JUMP2(op, loc, to, arg) \
300 : : insert_op2(op, loc, (sal_Int32) ((to) - (loc) - 3), arg, b)
301 : :
302 : : #define STREQ(s1, s2) (rtl_ustr_compare((s1), (s2)) ? (0) : (1))
303 : :
304 : : #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
305 : : #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
306 : :
307 : : /* The next available element. */
308 : : #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
309 : :
310 : : /* Get the next unsigned number in the uncompiled pattern. */
311 : : #define GET_UNSIGNED_NUMBER(num) { \
312 : : if (p != pend) { \
313 : : PATFETCH_RAW(c); \
314 : : while (c >= (sal_Unicode)'0' && c <= (sal_Unicode)'9') { \
315 : : if (num < 0) \
316 : : num = 0; \
317 : : num = num * 10 + c - (sal_Unicode)'0'; \
318 : : if (p == pend) \
319 : : break; \
320 : : PATFETCH_RAW(c); \
321 : : } \
322 : : } \
323 : : }
324 : :
325 : : /* Get the next hex number in the uncompiled pattern. */
326 : : #define GET_HEX_NUMBER(num) { \
327 : : if (p != pend) { \
328 : : sal_Bool stop = false; \
329 : : sal_Int16 hexcnt = 1; \
330 : : PATFETCH_RAW(c); \
331 : : while ( (c >= (sal_Unicode)'0' && c <= (sal_Unicode)'9') || (c >= (sal_Unicode)'a' && c <= (sal_Unicode)'f') || (c >= (sal_Unicode)'A' && c <= (sal_Unicode)'F') ) { \
332 : : if (num < 0) \
333 : : num = 0; \
334 : : if ( c >= (sal_Unicode)'0' && c <= (sal_Unicode)'9' ) \
335 : : num = num * 16 + c - (sal_Unicode)'0'; \
336 : : else if ( c >= (sal_Unicode)'a' && c <= (sal_Unicode)'f' ) \
337 : : num = num * 16 + (10 + c - (sal_Unicode)'a'); \
338 : : else \
339 : : num = num * 16 + (10 + c - (sal_Unicode)'A'); \
340 : : if (p == pend || hexcnt == 4) { \
341 : : stop = true; \
342 : : break; \
343 : : } \
344 : : PATFETCH_RAW(c); \
345 : : hexcnt++; \
346 : : } \
347 : : \
348 : : if ( ! stop ) { \
349 : : PATUNFETCH; \
350 : : hexcnt--; \
351 : : } \
352 : : if ( hexcnt > 4 || (num < 0 || num > 0xffff) ) num = -1;\
353 : : } \
354 : : }
355 : :
356 : :
357 : : /* Number of failure points for which to initially allocate space
358 : : when matching. If this number is exceeded, we allocate more
359 : : space, so it is not a hard limit. */
360 : : #ifndef INIT_FAILURE_ALLOC
361 : : # define INIT_FAILURE_ALLOC 5
362 : : #endif
363 : :
364 : : #define INIT_FAIL_STACK() \
365 : : do { \
366 : : fail_stack.stack = (fail_stack_elt_t *) \
367 : : REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \
368 : : \
369 : : if (fail_stack.stack == NULL) \
370 : : return -2; \
371 : : \
372 : : fail_stack.size = INIT_FAILURE_ALLOC; \
373 : : fail_stack.avail = 0; \
374 : : } while (0)
375 : :
376 : : #define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
377 : :
378 : : /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
379 : :
380 : : Return 1 if succeeds, and 0 if either ran out of memory
381 : : allocating space for it or it was already too large.
382 : :
383 : : REGEX_REALLOCATE_STACK requires `destination' be declared. */
384 : :
385 : : #define DOUBLE_FAIL_STACK(fail_stack) \
386 : : ((fail_stack).size > (sal_uInt32) (re_max_failures * MAX_FAILURE_ITEMS) \
387 : : ? 0 \
388 : : : ((fail_stack).stack = (fail_stack_elt_t *) \
389 : : REGEX_REALLOCATE_STACK ((fail_stack).stack, \
390 : : (fail_stack).size * sizeof (fail_stack_elt_t), \
391 : : ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
392 : : \
393 : : (fail_stack).stack == NULL \
394 : : ? 0 \
395 : : : ((fail_stack).size <<= 1, \
396 : : 1)))
397 : :
398 : :
399 : : #define REG_UNSET_VALUE (®_unset_dummy)
400 : : #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
401 : :
402 : : #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
403 : : #define IS_ACTIVE(R) ((R).bits.is_active)
404 : : #define MATCHED_SOMETHING(R) ((R).bits.matched_something)
405 : : #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
406 : :
407 : : /* Call this when have matched a real character; it sets `matched' flags
408 : : for the subexpressions which we are currently inside. Also records
409 : : that those subexprs have matched. */
410 : : #define SET_REGS_MATCHED() \
411 : : do { \
412 : : if (!set_regs_matched_done) { \
413 : : sal_uInt32 r; \
414 : : set_regs_matched_done = 1; \
415 : : for (r = lowest_active_reg; r <= highest_active_reg; r++) { \
416 : : MATCHED_SOMETHING(reg_info[r]) \
417 : : = EVER_MATCHED_SOMETHING(reg_info[r]) \
418 : : = 1; \
419 : : } \
420 : : } \
421 : : } \
422 : : while (0)
423 : :
424 : : #define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
425 : :
426 : : /* This converts PTR, a pointer into the search string `string2' into an offset from the beginning of that string. */
427 : : #define POINTER_TO_OFFSET(ptr) ((sal_Int32) ((ptr) - string2))
428 : :
429 : : /* This is the number of items that are pushed and popped on the stack
430 : : for each register. */
431 : : #define NUM_REG_ITEMS 3
432 : :
433 : : /* Individual items aside from the registers. */
434 : : # define NUM_NONREG_ITEMS 4
435 : :
436 : : /* We push at most this many items on the stack. */
437 : : /* We used to use (num_regs - 1), which is the number of registers
438 : : this regexp will save; but that was changed to 5
439 : : to avoid stack overflow for a regexp with lots of parens. */
440 : : #define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
441 : :
442 : : /* We actually push this many items. */
443 : : #define NUM_FAILURE_ITEMS \
444 : : (((0 \
445 : : ? 0 : highest_active_reg - lowest_active_reg + 1) \
446 : : * NUM_REG_ITEMS) \
447 : : + NUM_NONREG_ITEMS)
448 : :
449 : : /* How many items can still be added to the stack without overflowing it. */
450 : : #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
451 : :
452 : : /* Push a pointer value onto the failure stack.
453 : : Assumes the variable `fail_stack'. Probably should only
454 : : be called from within `PUSH_FAILURE_POINT'. */
455 : : #define PUSH_FAILURE_POINTER(item) \
456 : : fail_stack.stack[fail_stack.avail++].pointer = (sal_Unicode *) (item)
457 : :
458 : : /* This pushes an integer-valued item onto the failure stack.
459 : : Assumes the variable `fail_stack'. Probably should only
460 : : be called from within `PUSH_FAILURE_POINT'. */
461 : : #define PUSH_FAILURE_INT(item) \
462 : : fail_stack.stack[fail_stack.avail++].integer = (item)
463 : :
464 : : /* Push a fail_stack_elt_t value onto the failure stack.
465 : : Assumes the variable `fail_stack'. Probably should only
466 : : be called from within `PUSH_FAILURE_POINT'. */
467 : : #define PUSH_FAILURE_ELT(item) \
468 : : fail_stack.stack[fail_stack.avail++] = (item)
469 : :
470 : : /* These three POP... operations complement the three PUSH... operations.
471 : : All assume that `fail_stack' is nonempty. */
472 : : #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
473 : : #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
474 : : #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
475 : :
476 : : /* Test if at very beginning or at very end of `string2'. */
477 : : #define AT_STRINGS_BEG(d) ((d) == string2 || !size2)
478 : : #define AT_STRINGS_END(d) ((d) == end2)
479 : :
480 : : /* Checking for end of string */
481 : : #define PREFETCH() \
482 : : do { \
483 : : if ( d == end2 ) { \
484 : : goto fail; \
485 : : } \
486 : : } while (0)
487 : :
488 : :
489 : : sal_Bool
490 : 0 : Regexpr::iswordbegin(const sal_Unicode *d, sal_Unicode *string, sal_Int32 ssize)
491 : : {
492 [ # # ][ # # ]: 0 : if ( d == string || ! ssize ) return true;
493 : :
494 [ # # ][ # # ]: 0 : if ( !unicode::isAlphaDigit(d[-1]) && unicode::isAlphaDigit(d[0])) {
[ # # ]
495 : 0 : return true;
496 : : }
497 : 0 : return false;
498 : : }
499 : :
500 : : sal_Bool
501 : 0 : Regexpr::iswordend(const sal_Unicode *d, sal_Unicode *string, sal_Int32 ssize)
502 : : {
503 [ # # ]: 0 : if ( d == (string+ssize) ) return true;
504 : :
505 [ # # ][ # # ]: 0 : if ( !unicode::isAlphaDigit(d[0]) && unicode::isAlphaDigit(d[-1])) {
[ # # ]
506 : 0 : return true;
507 : : }
508 : 0 : return false;
509 : : }
510 : :
511 : : /* Push the information about the state we will need
512 : : if we ever fail back to it.
513 : :
514 : : Requires variables fail_stack, regstart, regend, and reg_info
515 : : be declared. DOUBLE_FAIL_STACK requires `destination'
516 : : be declared.
517 : :
518 : : Does `return FAILURE_CODE' if runs out of memory. */
519 : :
520 : : #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
521 : : do { \
522 : : char *destination; \
523 : : /* Must be int, so when we don't save any registers, the arithmetic \
524 : : of 0 + -1 isn't done as unsigned. */ \
525 : : /* Can't be int, since there is not a shred of a guarantee that int \
526 : : is wide enough to hold a value of something to which pointer can \
527 : : be assigned */ \
528 : : sal_uInt32 this_reg; \
529 : : \
530 : : /* Ensure we have enough space allocated for what we will push. */ \
531 : : while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) { \
532 : : if (!DOUBLE_FAIL_STACK(fail_stack)) \
533 : : return failure_code; \
534 : : } \
535 : : \
536 : : /* Push the info, starting with the registers. */ \
537 : : if (1) \
538 : : for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
539 : : this_reg++) { \
540 : : PUSH_FAILURE_POINTER(regstart[this_reg]); \
541 : : \
542 : : PUSH_FAILURE_POINTER (regend[this_reg]); \
543 : : \
544 : : PUSH_FAILURE_ELT(reg_info[this_reg].word); \
545 : : } \
546 : : \
547 : : PUSH_FAILURE_INT(lowest_active_reg); \
548 : : \
549 : : PUSH_FAILURE_INT(highest_active_reg); \
550 : : \
551 : : PUSH_FAILURE_POINTER(pattern_place); \
552 : : \
553 : : PUSH_FAILURE_POINTER(string_place); \
554 : : \
555 : : } while (0)
556 : :
557 : : /* Pops what PUSH_FAIL_STACK pushes.
558 : :
559 : : We restore into the parameters, all of which should be lvalues:
560 : : STR -- the saved data position.
561 : : PAT -- the saved pattern position.
562 : : LOW_REG, HIGH_REG -- the highest and lowest active registers.
563 : : REGSTART, REGEND -- arrays of string positions.
564 : : REG_INFO -- array of information about each subexpression.
565 : :
566 : : Also assumes the variables `fail_stack' and (if debugging), `bufp',
567 : : `pend', `string2', and `size2'. */
568 : :
569 : : #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info) {\
570 : : sal_uInt32 this_reg; \
571 : : sal_Unicode *string_temp; \
572 : : \
573 : : assert(!FAIL_STACK_EMPTY()); \
574 : : \
575 : : /* Remove failure points and point to how many regs pushed. */ \
576 : : assert(fail_stack.avail >= NUM_NONREG_ITEMS); \
577 : : \
578 : : /* If the saved string location is NULL, it came from an \
579 : : on_failure_keep_string_jump opcode, and we want to throw away the \
580 : : saved NULL, thus retaining our current position in the string. */ \
581 : : string_temp = POP_FAILURE_POINTER(); \
582 : : if (string_temp != NULL) \
583 : : str = (const sal_Unicode *) string_temp; \
584 : : \
585 : : pat = (sal_Unicode *) POP_FAILURE_POINTER(); \
586 : : \
587 : : /* Restore register info. */ \
588 : : high_reg = (sal_uInt32) POP_FAILURE_INT(); \
589 : : \
590 : : low_reg = (sal_uInt32) POP_FAILURE_INT(); \
591 : : \
592 : : if (1) \
593 : : for (this_reg = high_reg; this_reg >= low_reg; this_reg--) { \
594 : : \
595 : : reg_info[this_reg].word = POP_FAILURE_ELT(); \
596 : : \
597 : : regend[this_reg] = (const sal_Unicode *) POP_FAILURE_POINTER(); \
598 : : \
599 : : regstart[this_reg] = (const sal_Unicode *) POP_FAILURE_POINTER(); \
600 : : } else { \
601 : : for (this_reg = highest_active_reg; this_reg > high_reg; this_reg--) {\
602 : : reg_info[this_reg].word.integer = 0; \
603 : : regend[this_reg] = 0; \
604 : : regstart[this_reg] = 0; \
605 : : } \
606 : : highest_active_reg = high_reg; \
607 : : } \
608 : : \
609 : : set_regs_matched_done = 0; \
610 : : } /* POP_FAILURE_POINT */
611 : :
612 : : inline
613 : : void
614 : 0 : Regexpr::extract_number_and_incr( sal_Int32 & destination, sal_Unicode *& source )
615 : : {
616 : 0 : extract_number(destination, source);
617 : 0 : source += 2;
618 : 0 : }
619 : :
620 : :
621 : : inline
622 : : void
623 : 0 : Regexpr::store_op1(re_opcode_t op, sal_Unicode *loc, sal_Int32 arg)
624 : : {
625 : 0 : *loc = (sal_Unicode) op;
626 : 0 : store_number(loc + 1, arg);
627 : 0 : }
628 : :
629 : : /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
630 : :
631 : : inline
632 : : void
633 : 0 : Regexpr::store_op2(re_opcode_t op, sal_Unicode *loc, sal_Int32 arg1, sal_Int32 arg2)
634 : : {
635 : 0 : *loc = (sal_Unicode) op;
636 : 0 : store_number(loc + 1, arg1);
637 : 0 : store_number(loc + 3, arg2);
638 : 0 : }
639 : :
640 : : void
641 : 0 : Regexpr::insert_op1(re_opcode_t op, sal_Unicode *loc, sal_Int32 arg, sal_Unicode *end)
642 : : {
643 : 0 : register sal_Unicode *pfrom = end;
644 : 0 : register sal_Unicode *pto = end + 3;
645 : :
646 [ # # ]: 0 : while (pfrom != loc) {
647 : 0 : *--pto = *--pfrom;
648 : : }
649 : :
650 : 0 : store_op1(op, loc, arg);
651 : 0 : }
652 : :
653 : :
654 : : /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
655 : :
656 : : void
657 : 0 : Regexpr::insert_op2(re_opcode_t op, sal_Unicode *loc, sal_Int32 arg1, sal_Int32 arg2, sal_Unicode *end)
658 : : {
659 : 0 : register sal_Unicode *pfrom = end;
660 : 0 : register sal_Unicode *pto = end + 5;
661 : :
662 [ # # ]: 0 : while (pfrom != loc)
663 : 0 : *--pto = *--pfrom;
664 : :
665 : 0 : store_op2 (op, loc, arg1, arg2);
666 : 0 : }
667 : :
668 : : /* P points to just after a ^ in PATTERN. Return true if that ^ comes
669 : : after an alternative or a begin-subexpression. We assume there is at
670 : : least one character before the ^. */
671 : :
672 : : sal_Bool
673 : 0 : Regexpr::at_begline_loc_p(const sal_Unicode *local_pattern, const sal_Unicode *p)
674 : : {
675 : 0 : const sal_Unicode *prev = p - 2;
676 [ # # ][ # # ]: 0 : sal_Bool prev_prev_backslash = prev > local_pattern && prev[-1] == '\\';
677 : :
678 : : return(
679 : : /* After a subexpression? */
680 : : (*prev == (sal_Unicode)'(' && prev_prev_backslash)
681 : : /* After an alternative? */
682 [ # # ][ # # ]: 0 : || (*prev == (sal_Unicode)'|' && prev_prev_backslash));
[ # # ][ # # ]
683 : : }
684 : :
685 : : /* The dual of at_begline_loc_p. This one is for $. We assume there is
686 : : at least one character after the $, i.e., `P < PEND'. */
687 : :
688 : : sal_Bool
689 : 0 : Regexpr::at_endline_loc_p(const sal_Unicode *p)
690 : : {
691 : 0 : const sal_Unicode *next = p;
692 : : //sal_Bool next_backslash = *next == (sal_Unicode)'\\';
693 : : //const sal_Unicode *next_next = p + 1 < pend ? p + 1 : 0;
694 : :
695 : : return(
696 : : /* Before a subexpression? */
697 : : *next == (sal_Unicode)')'
698 : : // (next_backslash && next_next && *next_next == (sal_Unicode)')')
699 : : /* Before an alternative? */
700 [ # # ][ # # ]: 0 : || *next == (sal_Unicode)'|' );
701 : : // || (next_backslash && next_next && *next_next == (sal_Unicode)'|'));
702 : : }
703 : :
704 : : reg_errcode_t
705 : 0 : Regexpr::compile_range(sal_Unicode range_start, sal_Unicode range_end, sal_Unicode *b)
706 : : {
707 : : sal_uInt32 this_char;
708 : :
709 : : /* If the start is after the end, the range is empty. */
710 [ # # ]: 0 : if (range_start > range_end)
711 : 0 : return REG_NOERROR;
712 : :
713 : : /* Here we see why `this_char' has to be larger than an `sal_Unicode'
714 : : -- the range is inclusive, so if `range_end' == 0xffff
715 : : (assuming 16-bit characters), we would otherwise go into an infinite
716 : : loop, since all characters <= 0xffff. */
717 [ # # ]: 0 : for (this_char = range_start; this_char <= range_end; this_char++) {
718 : 0 : set_list_bit( sal_Unicode(this_char), b);
719 : : }
720 : :
721 : 0 : return REG_NOERROR;
722 : : }
723 : :
724 : : /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
725 : : false if it's not. */
726 : :
727 : : sal_Bool
728 : 0 : Regexpr::group_in_compile_stack(compile_stack_type compile_stack, sal_uInt32 regnum)
729 : : {
730 : : sal_Int32 this_element;
731 : :
732 [ # # ]: 0 : for (this_element = compile_stack.avail - 1;
733 : : this_element >= 0;
734 : : this_element--) {
735 [ # # ]: 0 : if (compile_stack.stack[this_element].regnum == regnum) {
736 : 0 : return true;
737 : : }
738 : : }
739 : :
740 : 0 : return false;
741 : : }
742 : :
743 : :
744 : 0 : Regexpr::Regexpr( const ::com::sun::star::util::SearchOptions & rOptions,
745 : : ::com::sun::star::uno::Reference<
746 : 0 : ::com::sun::star::i18n::XExtendedTransliteration > XTrans)
747 : : {
748 : 0 : bufp = NULL;
749 : 0 : pattern = NULL;
750 : :
751 [ # # ]: 0 : if ( rOptions.algorithmType != ::com::sun::star::util::SearchAlgorithms_REGEXP ) {
752 : 0 : return;
753 : : }
754 : :
755 [ # # # # ]: 0 : if ( rOptions.searchString == NULL ||
[ # # ]
756 : 0 : rOptions.searchString.isEmpty()) {
757 : 0 : return;
758 : : }
759 : :
760 : 0 : pattern = (sal_Unicode *)rOptions.searchString.getStr();
761 : 0 : patsize = rOptions.searchString.getLength();
762 : :
763 : 0 : re_max_failures = 2000;
764 : :
765 [ # # ]: 0 : translit = XTrans;
766 [ # # ]: 0 : translate = translit.is() ? 1 : 0;
767 : :
768 : 0 : bufp = NULL;
769 : :
770 : : isIgnoreCase = ((rOptions.transliterateFlags &
771 : 0 : ::com::sun::star::i18n::TransliterationModules_IGNORE_CASE) != 0);
772 : :
773 : : // Compile Regular expression pattern
774 [ # # ][ # # ]: 0 : if ( regcomp() != REG_NOERROR )
775 : : {
776 [ # # ]: 0 : if ( bufp )
777 : : {
778 [ # # ]: 0 : if ( bufp->buffer )
779 : 0 : free(bufp->buffer);
780 [ # # ]: 0 : if( bufp->fastmap )
781 : 0 : free(bufp->fastmap);
782 : :
783 : 0 : free(bufp);
784 : 0 : bufp = NULL;
785 : : }
786 : : }
787 : : }
788 : :
789 : 0 : Regexpr::~Regexpr()
790 : : {
791 : : // translit->remove();
792 [ # # ]: 0 : if( bufp )
793 : : {
794 [ # # ]: 0 : if( bufp->buffer )
795 : 0 : free(bufp->buffer);
796 [ # # ]: 0 : if( bufp->fastmap )
797 : 0 : free(bufp->fastmap);
798 : :
799 : 0 : free(bufp);
800 : 0 : bufp = NULL;
801 : : }
802 : :
803 : 0 : }
804 : :
805 : : // sets a new line to search in (restore start/end_ptr)
806 : : void
807 : 0 : Regexpr::set_line(const sal_Unicode *new_line, sal_Int32 len)
808 : : {
809 : 0 : line = new_line;
810 : 0 : linelen = len;
811 : 0 : }
812 : :
813 : : // main function for searching the pattern
814 : : // returns negative or startpos and sets regs
815 : : sal_Int32
816 : 0 : Regexpr::re_search(struct re_registers *regs, sal_Int32 pOffset)
817 : : {
818 : : // Check if pattern buffer is NULL
819 [ # # ]: 0 : if ( bufp == NULL ) {
820 : 0 : return(-3);
821 : : }
822 : :
823 : : sal_Int32 range;
824 : : sal_Int32 startpos;
825 : : sal_Int32 stoppos;
826 : :
827 : 0 : startpos = pOffset;
828 [ # # ]: 0 : if ( linelen < 0 ) {
829 : 0 : range = linelen + 1;
830 : 0 : linelen = -(linelen);
831 : 0 : stoppos = pOffset + 1;
832 : : } else {
833 : 0 : range = linelen - 1;
834 : 0 : stoppos = linelen;
835 : : }
836 : 0 : for ( ; ; ) {
837 : 0 : sal_Int32 val = re_match2(regs, startpos, stoppos);
838 : :
839 : : #ifndef REGEX_MALLOC
840 : : # ifdef C_ALLOCA
841 : : alloca (0);
842 : : # endif
843 : : #endif
844 : :
845 : : // Return success if match found
846 [ # # ]: 0 : if (val == 0) {
847 : 0 : break;
848 : : }
849 : :
850 [ # # ]: 0 : if (val == -2) {
851 : 0 : return(-2);
852 : : }
853 : :
854 : : // If match only beginning of string (startpos)
855 [ # # ]: 0 : if (!range) {
856 : 0 : break;
857 : : }
858 : :
859 : : // If search match from startpos to startpos+range
860 [ # # ]: 0 : else if (range > 0) { // Forward string search
861 : 0 : range--;
862 : 0 : startpos++;
863 : : } else { // Reverse string search
864 : 0 : range++;
865 : 0 : startpos--;
866 : : }
867 : : }
868 : :
869 [ # # ]: 0 : if ( regs->num_of_match > 0 )
870 : 0 : return(0);
871 : : else
872 : 0 : return(-1);
873 : : }
874 : :
875 : : sal_Int32
876 : 0 : Regexpr::regcomp()
877 : : {
878 : 0 : bufp = (struct re_pattern_buffer *)malloc(sizeof(struct re_pattern_buffer));
879 [ # # ]: 0 : if ( bufp == NULL ) {
880 : 0 : return(-1);
881 : : }
882 : :
883 : 0 : bufp->buffer = 0;
884 : 0 : bufp->allocated = 0;
885 : 0 : bufp->used = 0;
886 : :
887 : : //bufp->fastmap = (sal_Unicode*) malloc((1 << BYTEWIDTH) * sizeof(sal_Unicode));
888 : : // No fastmap with Unicode
889 : 0 : bufp->fastmap = NULL;
890 : :
891 : 0 : return(regex_compile());
892 : : }
893 : :
894 : : sal_Int32
895 : 0 : Regexpr::regex_compile()
896 : : {
897 : : register sal_Unicode c, c1;
898 : : const sal_Unicode *p1;
899 : : register sal_Unicode *b;
900 : :
901 : : /* Keeps track of unclosed groups. */
902 : : compile_stack_type compile_stack;
903 : :
904 : : /* Points to the current (ending) position in the pattern. */
905 : 0 : const sal_Unicode *p = pattern;
906 : 0 : const sal_Unicode *pend = pattern + patsize;
907 : :
908 : : /* Address of the count-byte of the most recently inserted `exactn'
909 : : command. This makes it possible to tell if a new exact-match
910 : : character can be added to that command or if the character requires
911 : : a new `exactn' command. */
912 : 0 : sal_Unicode *pending_exact = 0;
913 : :
914 : : /* Address of start of the most recently finished expression.
915 : : This tells, e.g., postfix * where to find the start of its
916 : : operand. Reset at the beginning of groups and alternatives. */
917 : 0 : sal_Unicode *laststart = 0;
918 : :
919 : : /* Address of beginning of regexp, or inside of last group. */
920 : : sal_Unicode *begalt;
921 : :
922 : : /* Place in the uncompiled pattern (i.e., the {) to
923 : : which to go back if the interval is invalid. */
924 : : const sal_Unicode *beg_interval;
925 : :
926 : : /* Address of the place where a forward jump should go to the end of
927 : : the containing expression. Each alternative of an `or' -- except the
928 : : last -- ends with a forward jump of this sort. */
929 : 0 : sal_Unicode *fixup_alt_jump = 0;
930 : :
931 : : /* Counts open-groups as they are encountered. Remembered for the
932 : : matching close-group on the compile stack, so the same register
933 : : number is put in the stop_memory as the start_memory. */
934 : 0 : sal_Int32 regnum = 0;
935 : :
936 : : /* Initialize the compile stack. */
937 : 0 : compile_stack.stack = (compile_stack_elt_t *)malloc(INIT_COMPILE_STACK_SIZE * sizeof(compile_stack_elt_t));
938 [ # # ]: 0 : if (compile_stack.stack == NULL)
939 : 0 : return(REG_ESPACE);
940 : :
941 : 0 : compile_stack.size = INIT_COMPILE_STACK_SIZE;
942 : 0 : compile_stack.avail = 0;
943 : :
944 : : /* Initialize the pattern buffer. */
945 : 0 : bufp->fastmap_accurate = 0;
946 : 0 : bufp->not_bol = 0;
947 : 0 : bufp->not_eol = 0;
948 : 0 : bufp->newline_anchor = 1;
949 : :
950 : : /* Set `used' to zero, so that if we return an error, the pattern
951 : : printer (for debugging) will think there's no pattern. We reset it
952 : : at the end. */
953 : 0 : bufp->used = 0;
954 : :
955 : : /* Always count groups. */
956 : 0 : bufp->re_nsub = 0;
957 : :
958 [ # # ]: 0 : if (bufp->allocated == 0) {
959 [ # # ]: 0 : if (bufp->buffer) {
960 : : /* If zero allocated, but buffer is non-null, try to realloc
961 : : enough space. This loses if buffer's address is bogus, but
962 : : that is the user's responsibility. */
963 : 0 : bufp->buffer = (sal_Unicode *)realloc(bufp->buffer, INIT_BUF_SIZE * sizeof(sal_Unicode));
964 : : } else { /* Caller did not allocate a buffer. Do it for them. */
965 : 0 : bufp->buffer = (sal_Unicode *)malloc(INIT_BUF_SIZE * sizeof(sal_Unicode));
966 : : }
967 [ # # ]: 0 : if (!bufp->buffer) FREE_STACK_RETURN(REG_ESPACE);
968 : :
969 : 0 : bufp->allocated = INIT_BUF_SIZE;
970 : : }
971 : :
972 : 0 : begalt = b = bufp->buffer;
973 : :
974 : : /* Loop through the uncompiled pattern until we're at the end. */
975 [ # # ]: 0 : while (p != pend) {
976 [ # # ]: 0 : PATFETCH_RAW(c);
977 : :
978 [ # # # # : 0 : switch (c) {
# # # # #
# # # ]
979 : : case (sal_Unicode)'^': {
980 [ # # ][ # # ]: 0 : if ( /* If at start of pattern, it's an operator. */
[ # # ]
981 : : p == pattern + 1
982 : : /* Otherwise, depends on what's come before. */
983 [ # # ]: 0 : || at_begline_loc_p(pattern, p))
984 [ # # ][ # # ]: 0 : BUF_PUSH(begline);
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
985 : : else
986 : 0 : goto normal_char;
987 : : }
988 : 0 : break;
989 : :
990 : : case (sal_Unicode)'$': {
991 [ # # ][ # # ]: 0 : if ( /* If at end of pattern, it's an operator. */
[ # # ]
992 : : p == pend
993 : : /* Otherwise, depends on what's next. */
994 [ # # ]: 0 : || at_endline_loc_p(p)) {
995 [ # # ][ # # ]: 0 : BUF_PUSH(endline);
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
996 : : } else {
997 : 0 : goto normal_char;
998 : : }
999 : : }
1000 : 0 : break;
1001 : :
1002 : : case (sal_Unicode)'+':
1003 : : case (sal_Unicode)'?':
1004 : : case (sal_Unicode)'*':
1005 : : /* If there is no previous pattern... */
1006 [ # # ]: 0 : if (!laststart) {
1007 : 0 : goto normal_char;
1008 : : }
1009 : :
1010 : : {
1011 : : /* Are we optimizing this jump? */
1012 : 0 : sal_Bool keep_string_p = false;
1013 : :
1014 : : /* 1 means zero (many) matches is allowed. */
1015 : 0 : sal_Unicode zero_times_ok = 0, many_times_ok = 0;
1016 : :
1017 : : /* If there is a sequence of repetition chars, collapse it
1018 : : down to just one (the right one). We can't combine
1019 : : interval operators with these because of, e.g., `a{2}*',
1020 : : which should only match an even number of `a's. */
1021 : :
1022 : 0 : for (;;) {
1023 : 0 : zero_times_ok |= c != (sal_Unicode)'+';
1024 : 0 : many_times_ok |= c != (sal_Unicode)'?';
1025 : :
1026 [ # # ]: 0 : if (p == pend)
1027 : 0 : break;
1028 : :
1029 [ # # ]: 0 : PATFETCH_RAW(c);
1030 : :
1031 [ # # ][ # # ]: 0 : if (c == (sal_Unicode)'*' || (c == (sal_Unicode)'+'
[ # # ]
1032 : : || c == (sal_Unicode)'?')) {
1033 : : } else {
1034 : 0 : PATUNFETCH;
1035 : 0 : break;
1036 : : }
1037 : :
1038 : : /* If we get here, we found another repeat character. */
1039 : : }
1040 : :
1041 : : /* Star, etc. applied to an empty pattern is equivalent
1042 : : to an empty pattern. */
1043 [ # # ]: 0 : if (!laststart) {
1044 : 0 : break;
1045 : : }
1046 : :
1047 : : /* Now we know whether or not zero matches is allowed
1048 : : and also whether or not two or more matches is allowed. */
1049 [ # # ]: 0 : if (many_times_ok) {
1050 : : /* More than one repetition is allowed, so put in at the
1051 : : end a backward relative jump from `b' to before the next
1052 : : jump we're going to put in below (which jumps from
1053 : : laststart to after this jump).
1054 : :
1055 : : But if we are at the `*' in the exact sequence `.*\n',
1056 : : insert an unconditional jump backwards to the .,
1057 : : instead of the beginning of the loop. This way we only
1058 : : push a failure point once, instead of every time
1059 : : through the loop. */
1060 : : assert(p - 1 > pattern);
1061 : :
1062 : : /* Allocate the space for the jump. */
1063 [ # # ][ # # ]: 0 : GET_BUFFER_SPACE(3);
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
1064 : :
1065 : : /* We know we are not at the first character of the pattern,
1066 : : because laststart was nonzero. And we've already
1067 : : incremented `p', by the way, to be the character after
1068 : : the `*'. Do we have to do something analogous here
1069 : : for null bytes, because of RE_DOT_NOT_NULL? */
1070 [ # # ][ # # ]: 0 : if (*(p - 2) == (sal_Unicode)'.'
[ # # ][ # # ]
1071 : : && zero_times_ok
1072 : : && p < pend && *p == (sal_Unicode)'\n') {
1073 : : /* We have .*\n. */
1074 : 0 : STORE_JUMP(jump, b, laststart);
1075 : 0 : keep_string_p = true;
1076 : : } else {
1077 : : /* Anything else. */
1078 : 0 : STORE_JUMP(maybe_pop_jump, b, laststart - 3);
1079 : : }
1080 : :
1081 : : /* We've added more stuff to the buffer. */
1082 : 0 : b += 3;
1083 : : }
1084 : :
1085 : : /* On failure, jump from laststart to b + 3, which will be the
1086 : : end of the buffer after this jump is inserted. */
1087 [ # # ][ # # ]: 0 : GET_BUFFER_SPACE(3);
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
1088 [ # # ]: 0 : INSERT_JUMP(keep_string_p ? on_failure_keep_string_jump
1089 : : : on_failure_jump,
1090 [ # # ]: 0 : laststart, b + 3);
1091 : 0 : pending_exact = 0;
1092 : 0 : b += 3;
1093 : :
1094 [ # # ]: 0 : if (!zero_times_ok) {
1095 : : /* At least one repetition is required, so insert a
1096 : : `dummy_failure_jump' before the initial
1097 : : `on_failure_jump' instruction of the loop. This
1098 : : effects a skip over that instruction the first time
1099 : : we hit that loop. */
1100 [ # # ][ # # ]: 0 : GET_BUFFER_SPACE(3);
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
1101 [ # # ]: 0 : INSERT_JUMP(dummy_failure_jump, laststart, laststart + 6);
1102 : 0 : b += 3;
1103 : : }
1104 : : }
1105 : 0 : break;
1106 : :
1107 : : case (sal_Unicode)'.':
1108 : 0 : laststart = b;
1109 [ # # ][ # # ]: 0 : BUF_PUSH(anychar);
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
1110 : 0 : break;
1111 : :
1112 : :
1113 : : case (sal_Unicode)'[': {
1114 : 0 : sal_Bool have_range = false;
1115 : 0 : sal_Unicode last_char = 0xffff;
1116 : 0 : sal_Unicode first_range = 0xffff;
1117 : 0 : sal_Unicode second_range = 0xffff;
1118 : : sal_Int16 bsiz;
1119 : :
1120 [ # # ]: 0 : if (p == pend) FREE_STACK_RETURN(REG_EBRACK);
1121 : :
1122 : : /* Ensure that we have enough space to push a charset: the
1123 : : opcode, the length count, and the bitset;
1124 : : 1 + 1 + (1 << BYTEWIDTH) / BYTEWIDTH "bytes" in all. */
1125 : 0 : bsiz = 2 + ((1 << BYTEWIDTH) / BYTEWIDTH);
1126 [ # # ][ # # ]: 0 : GET_BUFFER_SPACE(bsiz);
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
1127 : :
1128 : 0 : laststart = b;
1129 : :
1130 : : /* We test `*p == '^' twice, instead of using an if
1131 : : statement, so we only need one BUF_PUSH. */
1132 [ # # ][ # # ]: 0 : BUF_PUSH (*p == (sal_Unicode)'^' ? charset_not : charset);
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ]
1133 [ # # ]: 0 : if (*p == (sal_Unicode)'^')
1134 : 0 : p++;
1135 : :
1136 : : /* Remember the first position in the bracket expression. */
1137 : 0 : p1 = p;
1138 : :
1139 : : /* Push the number of "bytes" in the bitmap. */
1140 [ # # ][ # # ]: 0 : BUF_PUSH((1 << BYTEWIDTH) / BYTEWIDTH);
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
1141 : :
1142 : : /* Clear the whole map. */
1143 : 0 : memset(b, 0, ((1 << BYTEWIDTH) / BYTEWIDTH) * sizeof(sal_Unicode));
1144 : :
1145 : : /* Read in characters and ranges, setting map bits. */
1146 : 0 : for (;;) {
1147 [ # # ]: 0 : if (p == pend) FREE_STACK_RETURN(REG_EBRACK);
1148 : :
1149 [ # # ]: 0 : PATFETCH_RAW(c);
1150 : :
1151 [ # # ]: 0 : if ( c == (sal_Unicode)'\\' ) {
1152 : :
1153 [ # # ]: 0 : PATFETCH_RAW(c);
1154 : :
1155 [ # # ]: 0 : if ( c == (sal_Unicode)'x' ) {
1156 : 0 : sal_Int32 UniChar = -1;
1157 : :
1158 [ # # ][ # # ]: 0 : GET_HEX_NUMBER(UniChar);
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ]
1159 [ # # ][ # # ]: 0 : if (UniChar < 0 || UniChar > 0xffff) FREE_STACK_RETURN(REG_BADPAT);
1160 : 0 : c = (sal_Unicode) UniChar;
1161 : 0 : last_char = c;
1162 [ # # ]: 0 : set_list_bit(last_char, b);
1163 : : } else {
1164 : 0 : last_char = c;
1165 [ # # ]: 0 : set_list_bit(last_char, b);
1166 : : }
1167 [ # # ]: 0 : } else if (c == (sal_Unicode)']') {
1168 : : /* Could be the end of the bracket expression. If it's
1169 : : not (i.e., when the bracket expression is `[]' so
1170 : : far), the ']' character bit gets set way below. */
1171 : 0 : break;
1172 [ # # ]: 0 : } else if ( c == (sal_Unicode)'-' ) {
1173 [ # # ]: 0 : if ( !have_range ) {
1174 [ # # ]: 0 : if ( last_char != 0xffff ) {
1175 : 0 : first_range = last_char;
1176 : 0 : have_range = true;
1177 : 0 : continue;
1178 : : } else {
1179 : 0 : last_char = (sal_Unicode)'-';
1180 [ # # ]: 0 : set_list_bit(last_char, b);
1181 : : }
1182 : : }
1183 : : }
1184 : :
1185 : : /* See if we're at the beginning of a possible character
1186 : : class. */
1187 [ # # ][ # # ]: 0 : else if (c == (sal_Unicode)':' && p[-2] == (sal_Unicode)'[') {
1188 : : /* Leave room for the null. */
1189 : : sal_Unicode str[CHAR_CLASS_MAX_LENGTH + 1];
1190 : :
1191 [ # # ]: 0 : PATFETCH_RAW(c);
1192 : 0 : c1 = 0;
1193 : :
1194 : : /* If pattern is `[[:'. */
1195 [ # # ]: 0 : if (p == pend) FREE_STACK_RETURN(REG_EBRACK);
1196 : :
1197 : 0 : str[c1++] = c;
1198 : 0 : for (;;) {
1199 [ # # ]: 0 : PATFETCH_RAW(c);
1200 [ # # ][ # # ]: 0 : if ((c == (sal_Unicode)':' && *p == (sal_Unicode)']') || p == pend)
[ # # ]
1201 : 0 : break;
1202 [ # # ]: 0 : if (c1 < CHAR_CLASS_MAX_LENGTH)
1203 : 0 : str[c1++] = c;
1204 : : else
1205 : : /* This is in any case an invalid class name. */
1206 : 0 : str[0] = (sal_Unicode)'\0';
1207 : : }
1208 : 0 : str[c1] = (sal_Unicode)'\0';
1209 : :
1210 : : /* If isn't a word bracketed by `[:' and `:]':
1211 : : undo the ending character, the letters, and leave
1212 : : the leading `:' and `[' (but set bits for them). */
1213 [ # # ][ # # ]: 0 : if (c == (sal_Unicode)':' && *p == (sal_Unicode)']') {
1214 : : sal_Int32 ch;
1215 : : // no support for GRAPH, PUNCT, or XDIGIT yet
1216 [ # # ]: 0 : sal_Bool is_alnum = STREQ(str, ::rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("alnum")).getStr());
1217 [ # # ]: 0 : sal_Bool is_alpha = STREQ(str, ::rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("alpha")).getStr());
1218 [ # # ]: 0 : sal_Bool is_cntrl = STREQ(str, ::rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cntrl")).getStr());
1219 [ # # ]: 0 : sal_Bool is_digit = STREQ(str, ::rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("digit")).getStr());
1220 [ # # ]: 0 : sal_Bool is_lower = STREQ(str, ::rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("lower")).getStr());
1221 [ # # ]: 0 : sal_Bool is_print = STREQ(str, ::rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("print")).getStr());
1222 [ # # ]: 0 : sal_Bool is_space = STREQ(str, ::rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("space")).getStr());
1223 [ # # ]: 0 : sal_Bool is_upper = STREQ(str, ::rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("upper")).getStr());
1224 : :
1225 [ # # ]: 0 : if (!(is_alnum || is_alpha || is_cntrl ||
1226 [ # # ][ # # ]: 0 : is_digit || is_lower || is_print || is_space || is_upper) )
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ]
1227 : 0 : FREE_STACK_RETURN(REG_ECTYPE);
1228 : :
1229 : : /* Throw away the ] at the end of the character
1230 : : class. */
1231 [ # # ]: 0 : PATFETCH_RAW(c);
1232 : :
1233 [ # # ]: 0 : if (p == pend) FREE_STACK_RETURN(REG_EBRACK);
1234 : :
1235 [ # # ]: 0 : for (ch = 0; ch < 1 << BYTEWIDTH; ch++) {
1236 : : /* This was split into 3 if's to
1237 : : avoid an arbitrary limit in some compiler. */
1238 [ # # ][ # # ]: 0 : if ( (is_alnum && unicode::isAlphaDigit(sal_Unicode(ch))) ||
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
1239 [ # # ]: 0 : (is_alpha && unicode::isAlpha(sal_Unicode(ch))) ||
1240 [ # # ]: 0 : (is_cntrl && unicode::isControl(sal_Unicode(ch))))
1241 [ # # ]: 0 : set_list_bit(sal_Unicode(ch), b);
1242 [ # # ][ # # ]: 0 : if ( (is_digit && unicode::isDigit(sal_Unicode(ch))) ||
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
1243 [ # # ]: 0 : (is_lower && unicode::isLower(sal_Unicode(ch))) ||
1244 [ # # ]: 0 : (is_print && unicode::isPrint(sal_Unicode(ch))))
1245 [ # # ]: 0 : set_list_bit(sal_Unicode(ch), b);
1246 [ # # ][ # # ]: 0 : if ( (is_space && unicode::isSpace(sal_Unicode(ch))) ||
[ # # ][ # # ]
[ # # ][ # # ]
1247 [ # # ]: 0 : (is_upper && unicode::isUpper(sal_Unicode(ch))) )
1248 [ # # ]: 0 : set_list_bit(sal_Unicode(ch), b);
1249 [ # # ][ # # ]: 0 : if ( isIgnoreCase && (is_upper || is_lower) &&
[ # # ][ # # ]
[ # # ][ # # ]
1250 [ # # ][ # # ]: 0 : (unicode::isUpper(sal_Unicode(ch)) || unicode::isLower(sal_Unicode(ch))))
1251 [ # # ]: 0 : set_list_bit(sal_Unicode(ch), b);
1252 : : }
1253 : : break;
1254 : : } else {
1255 : 0 : p = p1+1;
1256 : 0 : p1++;
1257 : 0 : last_char = (sal_Unicode)':';
1258 [ # # ]: 0 : set_list_bit(last_char, b);
1259 : 0 : }
1260 : : } else {
1261 : 0 : last_char = c;
1262 [ # # ]: 0 : set_list_bit(last_char, b);
1263 : : }
1264 [ # # ]: 0 : if ( have_range ) {
1265 [ # # ]: 0 : if ( last_char != 0xffff ) {
1266 : 0 : second_range = last_char;
1267 : 0 : have_range = false;
1268 [ # # ]: 0 : compile_range(first_range, second_range, b);
1269 : 0 : } else FREE_STACK_RETURN(REG_EBRACK);
1270 : : } else {
1271 [ # # ]: 0 : if ( last_char != 0xffff ) {
1272 [ # # ]: 0 : set_list_bit(last_char, b);
1273 : 0 : } else FREE_STACK_RETURN(REG_EBRACK);
1274 : : }
1275 : : }
1276 : :
1277 : : /* Discard any (non)matching list bytes that are all 0 at the
1278 : : end of the map. Decrease the map-length byte too. */
1279 : 0 : bsiz = b[-1];
1280 [ # # ][ # # ]: 0 : while ((sal_Int16) bsiz > 0 && b[bsiz - 1] == 0)
[ # # ]
1281 : 0 : bsiz--;
1282 : 0 : b[-1] = (sal_Unicode)bsiz;
1283 : 0 : b += bsiz;
1284 : : }
1285 : 0 : break;
1286 : :
1287 : : case (sal_Unicode)'(':
1288 : 0 : goto handle_open;
1289 : :
1290 : : case (sal_Unicode)')':
1291 : 0 : goto handle_close;
1292 : :
1293 : : case (sal_Unicode)'\n':
1294 : 0 : goto normal_char;
1295 : :
1296 : : case (sal_Unicode)'|':
1297 : 0 : goto handle_alt;
1298 : :
1299 : : case (sal_Unicode)'{':
1300 : 0 : goto handle_interval;
1301 : :
1302 : : case (sal_Unicode)'\\':
1303 [ # # ]: 0 : if (p == pend) FREE_STACK_RETURN(REG_EESCAPE);
1304 : :
1305 : : /* Do not translate the character after the \, so that we can
1306 : : distinguish, e.g., \B from \b, even if we normally would
1307 : : translate, e.g., B to b. */
1308 [ # # ]: 0 : PATFETCH_RAW(c);
1309 : :
1310 [ # # # # : 0 : switch (c) {
# # # # #
# # # #
# ]
1311 : : case (sal_Unicode)'(':
1312 : 0 : goto normal_backslash;
1313 : :
1314 : : handle_open:
1315 : 0 : bufp->re_nsub++;
1316 : 0 : regnum++;
1317 : :
1318 [ # # ]: 0 : if (COMPILE_STACK_FULL) {
1319 : 0 : compile_stack.stack = (compile_stack_elt_t *)realloc(compile_stack.stack, (compile_stack.size << 1) * sizeof(compile_stack_elt_t));
1320 [ # # ]: 0 : if (compile_stack.stack == NULL) return(REG_ESPACE);
1321 : :
1322 : 0 : compile_stack.size <<= 1;
1323 : : }
1324 : :
1325 : : /* These are the values to restore when we hit end of this
1326 : : group. They are all relative offsets, so that if the
1327 : : whole pattern moves because of realloc, they will still
1328 : : be valid. */
1329 : 0 : COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
1330 : 0 : COMPILE_STACK_TOP.fixup_alt_jump
1331 [ # # ]: 0 : = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
1332 : 0 : COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
1333 : 0 : COMPILE_STACK_TOP.regnum = regnum;
1334 : :
1335 : : /* We will eventually replace the 0 with the number of
1336 : : groups inner to this one. But do not push a
1337 : : start_memory for groups beyond the last one we can
1338 : : represent in the compiled pattern. */
1339 [ # # ]: 0 : if (regnum <= MAX_REGNUM) {
1340 : 0 : COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2;
1341 [ # # ][ # # ]: 0 : BUF_PUSH_3 (start_memory, regnum, 0);
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
1342 : : }
1343 : :
1344 : 0 : compile_stack.avail++;
1345 : :
1346 : 0 : fixup_alt_jump = 0;
1347 : 0 : laststart = 0;
1348 : 0 : begalt = b;
1349 : : /* If we've reached MAX_REGNUM groups, then this open
1350 : : won't actually generate any code, so we'll have to
1351 : : clear pending_exact explicitly. */
1352 : 0 : pending_exact = 0;
1353 : 0 : break;
1354 : :
1355 : :
1356 : : case (sal_Unicode)')':
1357 : 0 : goto normal_backslash;
1358 : :
1359 : : handle_close:
1360 [ # # ]: 0 : if (fixup_alt_jump) {
1361 : : /* Push a dummy failure point at the end of the
1362 : : alternative for a possible future
1363 : : `pop_failure_jump' to pop. See comments at
1364 : : `push_dummy_failure' in `re_match2'. */
1365 [ # # ][ # # ]: 0 : BUF_PUSH(push_dummy_failure);
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
1366 : :
1367 : : /* We allocated space for this jump when we assigned
1368 : : to `fixup_alt_jump', in the `handle_alt' case below. */
1369 : 0 : STORE_JUMP(jump_past_alt, fixup_alt_jump, b - 1);
1370 : : }
1371 : :
1372 : : /* See similar code for backslashed left paren above. */
1373 [ # # ]: 0 : if (COMPILE_STACK_EMPTY) {
1374 : 0 : FREE_STACK_RETURN(REG_ERPAREN);
1375 : : }
1376 : :
1377 : : /* Since we just checked for an empty stack above, this
1378 : : ``can't happen''. */
1379 : : assert (compile_stack.avail != 0);
1380 : :
1381 : : {
1382 : : /* We don't just want to restore into `regnum', because
1383 : : later groups should continue to be numbered higher,
1384 : : as in `(ab)c(de)' -- the second group is #2. */
1385 : : sal_Int32 this_group_regnum;
1386 : :
1387 : 0 : compile_stack.avail--;
1388 : 0 : begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
1389 : : fixup_alt_jump
1390 : 0 : = COMPILE_STACK_TOP.fixup_alt_jump
1391 : 0 : ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
1392 [ # # ]: 0 : : 0;
1393 : 0 : laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
1394 : 0 : this_group_regnum = COMPILE_STACK_TOP.regnum;
1395 : : /* If we've reached MAX_REGNUM groups, then this open
1396 : : won't actually generate any code, so we'll have to
1397 : : clear pending_exact explicitly. */
1398 : 0 : pending_exact = 0;
1399 : :
1400 : : /* We're at the end of the group, so now we know how many
1401 : : groups were inside this one. */
1402 [ # # ]: 0 : if (this_group_regnum <= MAX_REGNUM) {
1403 : : sal_Unicode *inner_group_loc
1404 : 0 : = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset;
1405 : :
1406 : 0 : *inner_group_loc = sal::static_int_cast<sal_Unicode>( regnum - this_group_regnum );
1407 [ # # ][ # # ]: 0 : BUF_PUSH_3 (stop_memory, this_group_regnum,
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
1408 : : regnum - this_group_regnum);
1409 : : }
1410 : : }
1411 : 0 : break;
1412 : :
1413 : :
1414 : : case (sal_Unicode)'|': /* `\|'.
1415 : : * */
1416 : 0 : goto normal_backslash;
1417 : : handle_alt:
1418 : :
1419 : : /* Insert before the previous alternative a jump which
1420 : : jumps to this alternative if the former fails. */
1421 [ # # ][ # # ]: 0 : GET_BUFFER_SPACE (3);
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
1422 [ # # ]: 0 : INSERT_JUMP (on_failure_jump, begalt, b + 6);
1423 : 0 : pending_exact = 0;
1424 : 0 : b += 3;
1425 : :
1426 : : /* The alternative before this one has a jump after it
1427 : : which gets executed if it gets matched. Adjust that
1428 : : jump so it will jump to this alternative's analogous
1429 : : jump (put in below, which in turn will jump to the next
1430 : : (if any) alternative's such jump, etc.). The last such
1431 : : jump jumps to the correct final destination. A picture:
1432 : : _____ _____
1433 : : | | | |
1434 : : | v | v
1435 : : a | b | c
1436 : :
1437 : : If we are at `b', then fixup_alt_jump right now points to a
1438 : : three-byte space after `a'. We'll put in the jump, set
1439 : : fixup_alt_jump to right after `b', and leave behind three
1440 : : bytes which we'll fill in when we get to after `c'. */
1441 : :
1442 [ # # ]: 0 : if (fixup_alt_jump)
1443 : 0 : STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
1444 : :
1445 : : /* Mark and leave space for a jump after this alternative,
1446 : : to be filled in later either by next alternative or
1447 : : when know we're at the end of a series of alternatives. */
1448 : 0 : fixup_alt_jump = b;
1449 [ # # ][ # # ]: 0 : GET_BUFFER_SPACE (3);
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
1450 : 0 : b += 3;
1451 : :
1452 : 0 : laststart = 0;
1453 : 0 : begalt = b;
1454 : 0 : break;
1455 : :
1456 : :
1457 : : case (sal_Unicode)'{':
1458 : 0 : goto normal_backslash;
1459 : :
1460 : : handle_interval:
1461 : : {
1462 : : /* allows intervals. */
1463 : : /* At least (most) this many matches must be made. */
1464 : 0 : sal_Int32 lower_bound = -1, upper_bound = -1;
1465 : :
1466 : 0 : beg_interval = p - 1;
1467 : :
1468 [ # # ]: 0 : if (p == pend) {
1469 : 0 : goto unfetch_interval;
1470 : : }
1471 : :
1472 [ # # ][ # # ]: 0 : GET_UNSIGNED_NUMBER(lower_bound);
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
1473 : :
1474 [ # # ]: 0 : if (c == (sal_Unicode)',') {
1475 [ # # ][ # # ]: 0 : GET_UNSIGNED_NUMBER(upper_bound);
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
1476 [ # # ]: 0 : if (upper_bound < 0) upper_bound = RE_DUP_MAX;
1477 : : } else
1478 : : /* Interval such as `{1}' => match exactly once. */
1479 : 0 : upper_bound = lower_bound;
1480 : :
1481 [ # # ][ # # ]: 0 : if (lower_bound < 0 || upper_bound > RE_DUP_MAX
[ # # ]
1482 : : || lower_bound > upper_bound) {
1483 : : goto unfetch_interval;
1484 : : }
1485 : :
1486 [ # # ]: 0 : if (c != (sal_Unicode)'}') {
1487 : 0 : goto unfetch_interval;
1488 : : }
1489 : :
1490 : : /* We just parsed a valid interval. */
1491 : :
1492 : : /* If it's invalid to have no preceding re. */
1493 [ # # ]: 0 : if (!laststart) {
1494 : 0 : goto unfetch_interval;
1495 : : }
1496 : :
1497 : : /* If the upper bound is zero, don't want to succeed at
1498 : : all; jump from `laststart' to `b + 3', which will be
1499 : : the end of the buffer after we insert the jump. */
1500 [ # # ]: 0 : if (upper_bound == 0) {
1501 [ # # ][ # # ]: 0 : GET_BUFFER_SPACE(3);
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
1502 [ # # ]: 0 : INSERT_JUMP(jump, laststart, b + 3);
1503 : 0 : b += 3;
1504 : : }
1505 : :
1506 : : /* Otherwise, we have a nontrivial interval. When
1507 : : we're all done, the pattern will look like:
1508 : : set_number_at <jump count> <upper bound>
1509 : : set_number_at <succeed_n count> <lower bound>
1510 : : succeed_n <after jump addr> <succeed_n count>
1511 : : <body of loop>
1512 : : jump_n <succeed_n addr> <jump count>
1513 : : (The upper bound and `jump_n' are omitted if
1514 : : `upper_bound' is 1, though.) */
1515 : : else {
1516 : : /* If the upper bound is > 1, we need to insert
1517 : : more at the end of the loop. */
1518 [ # # ]: 0 : unsigned nbytes = 10 + (upper_bound > 1) * 10;
1519 : :
1520 [ # # ][ # # ]: 0 : GET_BUFFER_SPACE(nbytes);
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
1521 : :
1522 : : /* Initialize lower bound of the `succeed_n', even
1523 : : though it will be set during matching by its
1524 : : attendant `set_number_at' (inserted next),
1525 : : because `re_compile_fastmap' needs to know.
1526 : : Jump to the `jump_n' we might insert below. */
1527 [ # # ]: 0 : INSERT_JUMP2(succeed_n, laststart,
1528 : : b + 5 + (upper_bound > 1) * 5,
1529 [ # # ]: 0 : lower_bound);
1530 : 0 : b += 5;
1531 : :
1532 : : /* Code to initialize the lower bound. Insert
1533 : : before the `succeed_n'. The `5' is the last two
1534 : : bytes of this `set_number_at', plus 3 bytes of
1535 : : the following `succeed_n'. */
1536 [ # # ]: 0 : insert_op2(set_number_at, laststart, 5, lower_bound, b);
1537 : 0 : b += 5;
1538 : :
1539 [ # # ]: 0 : if (upper_bound > 1) {
1540 : : /* More than one repetition is allowed, so
1541 : : append a backward jump to the `succeed_n'
1542 : : that starts this interval.
1543 : :
1544 : : When we've reached this during matching,
1545 : : we'll have matched the interval once, so
1546 : : jump back only `upper_bound - 1' times. */
1547 : 0 : STORE_JUMP2(jump_n, b, laststart + 5,
1548 : 0 : upper_bound - 1);
1549 : 0 : b += 5;
1550 : :
1551 : : /* The location we want to set is the second
1552 : : parameter of the `jump_n'; that is `b-2' as
1553 : : an absolute address. `laststart' will be
1554 : : the `set_number_at' we're about to insert;
1555 : : `laststart+3' the number to set, the source
1556 : : for the relative address. But we are
1557 : : inserting into the middle of the pattern --
1558 : : so everything is getting moved up by 5.
1559 : : Conclusion: (b - 2) - (laststart + 3) + 5,
1560 : : i.e., b - laststart.
1561 : :
1562 : : We insert this at the beginning of the loop
1563 : : so that if we fail during matching, we'll
1564 : : reinitialize the bounds. */
1565 : : insert_op2(set_number_at, laststart, b - laststart,
1566 [ # # ]: 0 : upper_bound - 1, b);
1567 : 0 : b += 5;
1568 : : }
1569 : : }
1570 : 0 : pending_exact = 0;
1571 : 0 : beg_interval = NULL;
1572 : : }
1573 : 0 : break;
1574 : :
1575 : : unfetch_interval:
1576 : : /* If an invalid interval, match the characters as literals. */
1577 : : assert (beg_interval);
1578 : 0 : p = beg_interval;
1579 : 0 : beg_interval = NULL;
1580 : :
1581 : : /* normal_char and normal_backslash need `c'. */
1582 [ # # ]: 0 : PATFETCH_RAW(c);
1583 : :
1584 : 0 : goto normal_char;
1585 : :
1586 : : case (sal_Unicode)'`':
1587 [ # # ][ # # ]: 0 : BUF_PUSH(begbuf);
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
1588 : 0 : break;
1589 : :
1590 : : case (sal_Unicode)'\'':
1591 [ # # ][ # # ]: 0 : BUF_PUSH(endbuf);
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
1592 : 0 : break;
1593 : :
1594 : : case (sal_Unicode)'1': case (sal_Unicode)'2':
1595 : : case (sal_Unicode)'3': case (sal_Unicode)'4':
1596 : : case (sal_Unicode)'5': case (sal_Unicode)'6':
1597 : : case (sal_Unicode)'7': case (sal_Unicode)'8':
1598 : : case (sal_Unicode)'9':
1599 : 0 : c1 = c - (sal_Unicode)'0';
1600 : :
1601 [ # # ]: 0 : if (c1 > regnum)
1602 : 0 : FREE_STACK_RETURN(REG_ESUBREG);
1603 : :
1604 : : /* Can't back reference to a subexpression if inside of it. */
1605 [ # # ][ # # ]: 0 : if (group_in_compile_stack(compile_stack, (sal_uInt32) c1)) {
1606 : 0 : goto normal_char;
1607 : : }
1608 : :
1609 : 0 : laststart = b;
1610 [ # # ][ # # ]: 0 : BUF_PUSH_2(duplicate, c1);
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
1611 : 0 : break;
1612 : :
1613 : :
1614 : : case (sal_Unicode)'+':
1615 : : case (sal_Unicode)'?':
1616 : 0 : goto normal_backslash;
1617 : :
1618 : : case (sal_Unicode)'x': // Unicode char
1619 : : {
1620 : 0 : sal_Int32 UniChar = -1;
1621 : :
1622 [ # # ][ # # ]: 0 : GET_HEX_NUMBER(UniChar);
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ]
1623 [ # # ][ # # ]: 0 : if (UniChar < 0 || UniChar > 0xffff) FREE_STACK_RETURN(REG_BADPAT);
1624 : 0 : c = (sal_Unicode) UniChar;
1625 : 0 : goto normal_char;
1626 : : }
1627 : : // break; // unreachable - see goto above
1628 : :
1629 : : case (sal_Unicode)'<': // begin Word boundary
1630 [ # # ][ # # ]: 0 : BUF_PUSH(wordbeg);
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
1631 : 0 : break;
1632 : :
1633 : : case (sal_Unicode)'>': // end Word boundary
1634 [ # # ][ # # ]: 0 : BUF_PUSH(wordend);
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
1635 : 0 : break;
1636 : :
1637 : : case (sal_Unicode)'n':
1638 : 0 : c = 0x0a;
1639 : 0 : goto normal_char;
1640 : :
1641 : : case (sal_Unicode)'t':
1642 : 0 : c = 0x09;
1643 : 0 : goto normal_char;
1644 : :
1645 : : default:
1646 : : normal_backslash:
1647 : 0 : goto normal_char;
1648 : : }
1649 : 0 : break;
1650 : :
1651 : : default:
1652 : : /* Expects the character in `c'. */
1653 : : normal_char:
1654 : : /* If no exactn currently being built. */
1655 [ # # ][ # # ]: 0 : if ( pending_exact == NULL
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
1656 : :
1657 : : /* If last exactn not at current position. */
1658 : 0 : || pending_exact + *pending_exact + 1 != b
1659 : :
1660 : : /* We have only one sal_Unicode char following the
1661 : : exactn for the count. */
1662 : : || *pending_exact == (1 << BYTEWIDTH) - 1
1663 : :
1664 : : /* If followed by a repetition operator. */
1665 : : || *p == (sal_Unicode)'*' || *p == (sal_Unicode)'^'
1666 : : || *p == (sal_Unicode)'+' || *p == (sal_Unicode)'?'
1667 : : || *p == (sal_Unicode) '{' ) {
1668 : : /* Start building a new exactn. */
1669 : 0 : laststart = b;
1670 [ # # ][ # # ]: 0 : BUF_PUSH_2(exactn, 0);
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
1671 : 0 : pending_exact = b - 1;
1672 : : }
1673 : :
1674 [ # # ]: 0 : if ( translate ) {
1675 : : try {
1676 [ # # ][ # # ]: 0 : sal_Unicode tmp = translit->transliterateChar2Char(c);
1677 [ # # ][ # # ]: 0 : BUF_PUSH(tmp);
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
1678 : 0 : (*pending_exact)++;
1679 [ # # # # : 0 : } catch (const ::com::sun::star::i18n::MultipleCharsOutputException&) {
# # ]
1680 [ # # # # ]: 0 : ::rtl::OUString o2( translit->transliterateChar2String( c));
1681 : 0 : sal_Int32 len2 = o2.getLength();
1682 : 0 : const sal_Unicode * k2 = o2.getStr();
1683 [ # # ]: 0 : for (sal_Int32 nmatch = 0; nmatch < len2; nmatch++) {
1684 [ # # # # : 0 : BUF_PUSH(k2[nmatch]);
# # # # #
# # # # #
# # ]
1685 : 0 : (*pending_exact)++;
1686 [ # # ]: 0 : }
1687 : : }
1688 : : } else {
1689 [ # # ][ # # ]: 0 : BUF_PUSH(c);
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
1690 : 0 : (*pending_exact)++;
1691 : : }
1692 : 0 : break;
1693 : : } /* switch (c) */
1694 : : } /* while p != pend */
1695 : :
1696 : : /* Through the pattern now. */
1697 : :
1698 [ # # ]: 0 : if (fixup_alt_jump)
1699 : 0 : STORE_JUMP(jump_past_alt, fixup_alt_jump, b);
1700 : :
1701 [ # # ]: 0 : if (!COMPILE_STACK_EMPTY)
1702 : 0 : FREE_STACK_RETURN(REG_EPAREN);
1703 : :
1704 : : // Assumes no backtracking
1705 [ # # ][ # # ]: 0 : BUF_PUSH(succeed);
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
1706 : :
1707 [ # # ]: 0 : if ( compile_stack.stack )
1708 : 0 : free(compile_stack.stack);
1709 : 0 : compile_stack.stack = NULL;
1710 : :
1711 : : /* We have succeeded; set the length of the buffer. */
1712 : 0 : bufp->used = b - bufp->buffer;
1713 : :
1714 : 0 : return REG_NOERROR;
1715 : : } /* regex_compile */
1716 : :
1717 : : /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
1718 : : bytes; nonzero otherwise. */
1719 : :
1720 : : sal_Int32
1721 : 0 : Regexpr::bcmp_translate(const sal_Unicode *s1, const sal_Unicode *s2, sal_Int32 len)
1722 : : {
1723 [ # # ]: 0 : for (sal_Int32 nmatch = 0; nmatch < len; nmatch++) {
1724 [ # # ]: 0 : if (*s1++ != *s2++) {
1725 : 0 : return(1);
1726 : : }
1727 : : }
1728 : :
1729 : 0 : return(0);
1730 : : }
1731 : :
1732 : :
1733 : : /* We are passed P pointing to a register number after a start_memory.
1734 : :
1735 : : Return true if the pattern up to the corresponding stop_memory can
1736 : : match the empty string, and false otherwise.
1737 : :
1738 : : If we find the matching stop_memory, sets P to point to one past its number.
1739 : : Otherwise, sets P to an undefined byte less than or equal to END.
1740 : :
1741 : : We don't handle duplicates properly (yet). */
1742 : :
1743 : : sal_Bool
1744 : 0 : Regexpr::group_match_null_string_p(sal_Unicode **p, sal_Unicode *end, register_info_type *reg_info)
1745 : : {
1746 : : sal_Int32 mcnt;
1747 : : /* Point to after the args to the start_memory. */
1748 : 0 : sal_Unicode *p1 = *p + 2;
1749 : :
1750 [ # # ]: 0 : while (p1 < end) {
1751 : : /* Skip over opcodes that can match nothing, and return true or
1752 : : false, as appropriate, when we get to one that can't, or to the
1753 : : matching stop_memory. */
1754 : :
1755 [ # # # ]: 0 : switch ((re_opcode_t) *p1) {
1756 : : /* Could be either a loop or a series of alternatives. */
1757 : : case on_failure_jump:
1758 : 0 : p1++;
1759 : 0 : extract_number_and_incr(mcnt, p1);
1760 : :
1761 : : /* If the next operation is not a jump backwards in the
1762 : : pattern. */
1763 : :
1764 [ # # ]: 0 : if (mcnt >= 0) {
1765 : : /* Go through the on_failure_jumps of the alternatives,
1766 : : seeing if any of the alternatives cannot match nothing.
1767 : : The last alternative starts with only a jump,
1768 : : whereas the rest start with on_failure_jump and end
1769 : : with a jump, e.g., here is the pattern for `a|b|c':
1770 : :
1771 : : /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
1772 : : /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
1773 : : /exactn/1/c
1774 : :
1775 : : So, we have to first go through the first (n-1)
1776 : : alternatives and then deal with the last one separately. */
1777 : :
1778 : :
1779 : : /* Deal with the first (n-1) alternatives, which start
1780 : : with an on_failure_jump (see above) that jumps to right
1781 : : past a jump_past_alt. */
1782 : :
1783 [ # # ]: 0 : while ((re_opcode_t) p1[mcnt-3] == jump_past_alt) {
1784 : : /* `mcnt' holds how many bytes long the alternative
1785 : : is, including the ending `jump_past_alt' and
1786 : : its number. */
1787 : :
1788 [ # # ][ # # ]: 0 : if (!alt_match_null_string_p(p1, p1 + mcnt - 3, reg_info))
1789 : 0 : return false;
1790 : :
1791 : : /* Move to right after this alternative, including the
1792 : : jump_past_alt. */
1793 : 0 : p1 += mcnt;
1794 : :
1795 : : /* Break if it's the beginning of an n-th alternative
1796 : : that doesn't begin with an on_failure_jump. */
1797 [ # # ]: 0 : if ((re_opcode_t) *p1 != on_failure_jump)
1798 : 0 : break;
1799 : :
1800 : : /* Still have to check that it's not an n-th
1801 : : alternative that starts with an on_failure_jump. */
1802 : 0 : p1++;
1803 : 0 : extract_number_and_incr(mcnt, p1);
1804 [ # # ]: 0 : if ((re_opcode_t) p1[mcnt-3] != jump_past_alt) {
1805 : : /* Get to the beginning of the n-th alternative. */
1806 : 0 : p1 -= 3;
1807 : 0 : break;
1808 : : }
1809 : : }
1810 : :
1811 : : /* Deal with the last alternative: go back and get number
1812 : : of the `jump_past_alt' just before it. `mcnt' contains
1813 : : the length of the alternative. */
1814 : 0 : extract_number(mcnt, p1 - 2);
1815 : :
1816 [ # # ][ # # ]: 0 : if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
1817 : 0 : return false;
1818 : :
1819 : 0 : p1 += mcnt; /* Get past the n-th alternative. */
1820 : : } /* if mcnt > 0 */
1821 : 0 : break;
1822 : :
1823 : :
1824 : : case stop_memory:
1825 : : assert (p1[1] == **p);
1826 : 0 : *p = p1 + 2;
1827 : 0 : return true;
1828 : :
1829 : :
1830 : : default:
1831 [ # # ][ # # ]: 0 : if (!common_op_match_null_string_p(&p1, end, reg_info))
1832 : 0 : return false;
1833 : : }
1834 : : } /* while p1 < end */
1835 : :
1836 : 0 : return false;
1837 : : } /* group_match_null_string_p */
1838 : :
1839 : : /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
1840 : : It expects P to be the first byte of a single alternative and END one
1841 : : byte past the last. The alternative can contain groups. */
1842 : :
1843 : : sal_Bool
1844 : 0 : Regexpr::alt_match_null_string_p(sal_Unicode *p, sal_Unicode *end, register_info_type *reg_info)
1845 : : {
1846 : : sal_Int32 mcnt;
1847 : 0 : sal_Unicode *p1 = p;
1848 : :
1849 [ # # ]: 0 : while (p1 < end) {
1850 : : /* Skip over opcodes that can match nothing, and break when we get
1851 : : to one that can't. */
1852 : :
1853 [ # # ]: 0 : switch ((re_opcode_t) *p1) {
1854 : : /* It's a loop. */
1855 : : case on_failure_jump:
1856 : 0 : p1++;
1857 : 0 : extract_number_and_incr(mcnt, p1);
1858 : 0 : p1 += mcnt;
1859 : 0 : break;
1860 : :
1861 : : default:
1862 [ # # ][ # # ]: 0 : if (!common_op_match_null_string_p(&p1, end, reg_info))
1863 : 0 : return false;
1864 : : }
1865 : : } /* while p1 < end */
1866 : :
1867 : 0 : return true;
1868 : : } /* alt_match_null_string_p */
1869 : :
1870 : :
1871 : : /* Deals with the ops common to group_match_null_string_p and
1872 : : alt_match_null_string_p.
1873 : :
1874 : : Sets P to one after the op and its arguments, if any. */
1875 : :
1876 : : sal_Bool
1877 : 0 : Regexpr::common_op_match_null_string_p(sal_Unicode **p, sal_Unicode *end, register_info_type *reg_info)
1878 : : {
1879 : : sal_Int32 mcnt;
1880 : : sal_Bool ret;
1881 : : sal_Int32 reg_no;
1882 : 0 : sal_Unicode *p1 = *p;
1883 : :
1884 [ # # # # : 0 : switch ((re_opcode_t) *p1++) {
# # # ]
1885 : : case no_op:
1886 : : case begline:
1887 : : case endline:
1888 : : case begbuf:
1889 : : case endbuf:
1890 : 0 : break;
1891 : :
1892 : : case start_memory:
1893 : 0 : reg_no = *p1;
1894 : : assert (reg_no > 0 && reg_no <= MAX_REGNUM);
1895 [ # # ]: 0 : ret = group_match_null_string_p(&p1, end, reg_info);
1896 : : /* Have to set this here in case we're checking a group which
1897 : : contains a group and a back reference to it. */
1898 : :
1899 [ # # ]: 0 : if (REG_MATCH_NULL_STRING_P(reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
1900 : 0 : REG_MATCH_NULL_STRING_P(reg_info[reg_no]) = ret;
1901 : :
1902 [ # # ]: 0 : if (!ret)
1903 : 0 : return false;
1904 : 0 : break;
1905 : :
1906 : : /* If this is an optimized succeed_n for zero times, make the jump. */
1907 : : case jump:
1908 : 0 : extract_number_and_incr(mcnt, p1);
1909 [ # # ]: 0 : if (mcnt >= 0)
1910 : 0 : p1 += mcnt;
1911 : : else
1912 : 0 : return false;
1913 : 0 : break;
1914 : :
1915 : : case succeed_n:
1916 : : /* Get to the number of times to succeed. */
1917 : 0 : p1 += 2;
1918 : 0 : extract_number_and_incr(mcnt, p1);
1919 : :
1920 [ # # ]: 0 : if (mcnt == 0)
1921 : : {
1922 : 0 : p1 -= 4;
1923 : 0 : extract_number_and_incr(mcnt, p1);
1924 : 0 : p1 += mcnt;
1925 : : }
1926 : : else
1927 : 0 : return false;
1928 : 0 : break;
1929 : :
1930 : : case duplicate:
1931 [ # # ]: 0 : if (!REG_MATCH_NULL_STRING_P(reg_info[*p1]))
1932 : 0 : return false;
1933 : 0 : break;
1934 : :
1935 : : case set_number_at:
1936 : 0 : p1 += 4;
1937 : :
1938 : : default:
1939 : : /* All other opcodes mean we cannot match the empty string. */
1940 : 0 : return false;
1941 : : }
1942 : :
1943 : 0 : *p = p1;
1944 : 0 : return true;
1945 : : } /* common_op_match_null_string_p */
1946 : :
1947 : :
1948 : :
1949 : : /* Free everything we malloc. */
1950 : : #ifdef MATCH_MAY_ALLOCATE
1951 : : # define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL
1952 : : # define FREE_VARIABLES() \
1953 : : do { \
1954 : : REGEX_FREE_STACK (fail_stack.stack); \
1955 : : FREE_VAR (regstart); \
1956 : : FREE_VAR (regend); \
1957 : : FREE_VAR (old_regstart); \
1958 : : FREE_VAR (old_regend); \
1959 : : FREE_VAR (best_regstart); \
1960 : : FREE_VAR (best_regend); \
1961 : : FREE_VAR (reg_info); \
1962 : : FREE_VAR (reg_dummy); \
1963 : : FREE_VAR (reg_info_dummy); \
1964 : : } while (0)
1965 : : #else
1966 : : # define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
1967 : : #endif /* not MATCH_MAY_ALLOCATE */
1968 : :
1969 : : /* This is a separate function so that we can force an alloca cleanup
1970 : : afterwards. */
1971 : : sal_Int32
1972 : 0 : Regexpr::re_match2(struct re_registers *regs, sal_Int32 pos, sal_Int32 range)
1973 : : {
1974 : : /* General temporaries. */
1975 : : sal_Int32 mcnt;
1976 : : sal_Unicode *p1;
1977 : :
1978 : : /* Just past the end of the corresponding string. */
1979 : : sal_Unicode *end2;
1980 : :
1981 : : /* Pointers into string2, just past the last characters in
1982 : : each to consider matching. */
1983 : : sal_Unicode *end_match_2;
1984 : :
1985 : : /* Where we are in the data, and the end of the current string. */
1986 : : const sal_Unicode *d, *dend;
1987 : :
1988 : : /* Where we are in the compiled pattern, and the end of the compiled
1989 : : pattern. */
1990 : 0 : sal_Unicode *p = bufp->buffer;
1991 : 0 : register sal_Unicode *pend = p + bufp->used;
1992 : :
1993 : : /* Mark the opcode just after a start_memory, so we can test for an
1994 : : empty subpattern when we get to the stop_memory. */
1995 : 0 : sal_Unicode *just_past_start_mem = 0;
1996 : :
1997 : : /* Failure point stack. Each place that can handle a failure further
1998 : : down the line pushes a failure point on this stack. It consists of
1999 : : restart, regend, and reg_info for all registers corresponding to
2000 : : the subexpressions we're currently inside, plus the number of such
2001 : : registers, and, finally, two sal_Unicode *'s. The first
2002 : : sal_Unicode * is where to resume scanning the pattern; the second
2003 : : one is where to resume scanning the strings. If the latter is
2004 : : zero, the failure point is a ``dummy''; if a failure happens and
2005 : : the failure point is a dummy, it gets discarded and the next next
2006 : : one is tried. */
2007 : : #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
2008 : : fail_stack_type fail_stack;
2009 : : #endif
2010 : :
2011 : : /* We fill all the registers internally, independent of what we
2012 : : return, for use in backreferences. The number here includes
2013 : : an element for register zero. */
2014 : 0 : size_t num_regs = bufp->re_nsub + 1;
2015 : :
2016 : : /* The currently active registers. */
2017 : 0 : sal_uInt32 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
2018 : 0 : sal_uInt32 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
2019 : :
2020 : : /* Information on the contents of registers. These are pointers into
2021 : : the input strings; they record just what was matched (on this
2022 : : attempt) by a subexpression part of the pattern, that is, the
2023 : : regnum-th regstart pointer points to where in the pattern we began
2024 : : matching and the regnum-th regend points to right after where we
2025 : : stopped matching the regnum-th subexpression. (The zeroth register
2026 : : keeps track of what the whole pattern matches.) */
2027 : : #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
2028 : : const sal_Unicode **regstart, **regend;
2029 : : #endif
2030 : :
2031 : : /* If a group that's operated upon by a repetition operator fails to
2032 : : match anything, then the register for its start will need to be
2033 : : restored because it will have been set to wherever in the string we
2034 : : are when we last see its open-group operator. Similarly for a
2035 : : register's end. */
2036 : : #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
2037 : : const sal_Unicode **old_regstart, **old_regend;
2038 : : #endif
2039 : :
2040 : : /* The is_active field of reg_info helps us keep track of which (possibly
2041 : : nested) subexpressions we are currently in. The matched_something
2042 : : field of reg_info[reg_num] helps us tell whether or not we have
2043 : : matched any of the pattern so far this time through the reg_num-th
2044 : : subexpression. These two fields get reset each time through any
2045 : : loop their register is in. */
2046 : : #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
2047 : : register_info_type *reg_info;
2048 : : #endif
2049 : :
2050 : : /* The following record the register info as found in the above
2051 : : variables when we find a match better than any we've seen before.
2052 : : This happens as we backtrack through the failure points, which in
2053 : : turn happens only if we have not yet matched the entire string. */
2054 : : //unsigned best_regs_set = false;
2055 : : #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
2056 : : const sal_Unicode **best_regstart, **best_regend;
2057 : : #endif
2058 : :
2059 : : /* Logically, this is `best_regend[0]'. But we don't want to have to
2060 : : allocate space for that if we're not allocating space for anything
2061 : : else (see below). Also, we never need info about register 0 for
2062 : : any of the other register vectors, and it seems rather a kludge to
2063 : : treat `best_regend' differently than the rest. So we keep track of
2064 : : the end of the best match so far in a separate variable. We
2065 : : initialize this to NULL so that when we backtrack the first time
2066 : : and need to test it, it's not garbage. */
2067 : : //const sal_Unicode *match_end = NULL;
2068 : :
2069 : : /* This helps SET_REGS_MATCHED avoid doing redundant work. */
2070 : 0 : sal_Int32 set_regs_matched_done = 0;
2071 : :
2072 : : /* Used when we pop values we don't care about. */
2073 : : #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
2074 : : const sal_Unicode **reg_dummy;
2075 : : register_info_type *reg_info_dummy;
2076 : : #endif
2077 : :
2078 [ # # ]: 0 : INIT_FAIL_STACK();
2079 : :
2080 : : #ifdef MATCH_MAY_ALLOCATE
2081 : : /* Do not bother to initialize all the register variables if there are
2082 : : no groups in the pattern, as it takes a fair amount of time. If
2083 : : there are groups, we include space for register 0 (the whole
2084 : : pattern), even though we never use it, since it simplifies the
2085 : : array indexing. We should fix this. */
2086 [ # # ]: 0 : if (bufp->re_nsub)
2087 : : {
2088 : 0 : regstart = REGEX_TALLOC (num_regs, const sal_Unicode *);
2089 : 0 : regend = REGEX_TALLOC (num_regs, const sal_Unicode *);
2090 : 0 : old_regstart = REGEX_TALLOC (num_regs, const sal_Unicode *);
2091 : 0 : old_regend = REGEX_TALLOC (num_regs, const sal_Unicode *);
2092 : 0 : best_regstart = REGEX_TALLOC (num_regs, const sal_Unicode *);
2093 : 0 : best_regend = REGEX_TALLOC (num_regs, const sal_Unicode *);
2094 : 0 : reg_info = REGEX_TALLOC (num_regs, register_info_type);
2095 : 0 : reg_dummy = REGEX_TALLOC (num_regs, const sal_Unicode *);
2096 : 0 : reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
2097 : :
2098 [ # # ]: 0 : if (!(regstart && regend && old_regstart && old_regend && reg_info
2099 [ # # ][ # # ]: 0 : && best_regstart && best_regend && reg_dummy && reg_info_dummy))
[ # # ][ # # ]
[ # # ][ # # ]
[ # # ][ # # ]
2100 : : {
2101 : 0 : FREE_VARIABLES ();
2102 : 0 : return -2;
2103 : : }
2104 : : }
2105 : : else
2106 : : {
2107 : : /* We must initialize all our variables to NULL, so that
2108 : : `FREE_VARIABLES' doesn't try to free them. */
2109 : : regstart = regend = old_regstart = old_regend = best_regstart
2110 : 0 : = best_regend = reg_dummy = NULL;
2111 : 0 : reg_info = reg_info_dummy = (register_info_type *) NULL;
2112 : : }
2113 : : #endif /* MATCH_MAY_ALLOCATE */
2114 : :
2115 : 0 : sal_Unicode *string2 = (sal_Unicode *)line;
2116 : 0 : sal_Int32 size2 = linelen;
2117 : 0 : sal_Int32 stop = range;
2118 : :
2119 : : /* The starting position is bogus. */
2120 [ # # ][ # # ]: 0 : if (pos < 0 || pos >= size2 || linelen <= 0 ) {
[ # # ]
2121 : 0 : FREE_VARIABLES ();
2122 : 0 : return(-1);
2123 : : }
2124 : :
2125 : : /* Initialize subexpression text positions to -1 to mark ones that no
2126 : : start_memory/stop_memory has been seen for. Also initialize the
2127 : : register information struct. */
2128 [ # # ]: 0 : for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++) {
2129 : 0 : regstart[mcnt] = regend[mcnt]
2130 : 0 : = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
2131 : :
2132 : 0 : REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
2133 : 0 : IS_ACTIVE (reg_info[mcnt]) = 0;
2134 : 0 : MATCHED_SOMETHING (reg_info[mcnt]) = 0;
2135 : 0 : EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
2136 : : }
2137 : :
2138 : 0 : end2 = (sal_Unicode *)(string2 + size2);
2139 : :
2140 : 0 : end_match_2 = (sal_Unicode *)(string2 + stop);
2141 : :
2142 : : /* `p' scans through the pattern as `d' scans through the data.
2143 : : `dend' is the end of the input string that `d' points within. `d'
2144 : : is advanced into the following input string whenever necessary, but
2145 : : this happens before fetching; therefore, at the beginning of the
2146 : : loop, `d' can be pointing at the end of a string, but it cannot
2147 : : equal `string2'. */
2148 : 0 : d = string2 + pos;
2149 : 0 : dend = end_match_2;
2150 : :
2151 : : /* This loops over pattern commands. It exits by returning from the
2152 : : function if the match is complete, or it drops through if the match
2153 : : fails at this starting point in the input data. */
2154 : 0 : for (;;) {
2155 [ # # ]: 0 : if (p == pend) {
2156 : : /* End of pattern means we might have succeeded. */
2157 : :
2158 : : /* If we haven't matched the entire string, and we want the
2159 : : longest match, try backtracking. */
2160 [ # # ]: 0 : if (d != end_match_2) {
2161 [ # # ]: 0 : if (!FAIL_STACK_EMPTY()) {
2162 : 0 : goto fail;
2163 : : }
2164 : : } /* d != end_match_2 */
2165 : :
2166 : : succeed_label:
2167 : :
2168 : : /* If caller wants register contents data back, do it. */
2169 [ # # ]: 0 : if (regs) {
2170 : : /* Have the register data arrays been allocated? */
2171 [ # # ]: 0 : if (regs->num_regs == 0) {
2172 : : /* No. So allocate them with malloc. We need one
2173 : : extra element beyond `num_regs' for the `-1' marker
2174 : : GNU code uses. */
2175 : 0 : regs->num_of_match = 0;
2176 : 0 : regs->num_regs = MAX(RE_NREGS, num_regs + 1);
2177 : 0 : regs->start = (sal_Int32 *) malloc(regs->num_regs * sizeof(sal_Int32));
2178 : 0 : regs->end = (sal_Int32 *) malloc(regs->num_regs * sizeof(sal_Int32));
2179 [ # # ][ # # ]: 0 : if (regs->start == NULL || regs->end == NULL) {
2180 : 0 : FREE_VARIABLES ();
2181 : 0 : return(-2);
2182 : : }
2183 [ # # ]: 0 : } else if ( regs->num_regs > 0 ) {
2184 : : /* Yes. If we need more elements than were already
2185 : : allocated, reallocate them. If we need fewer, just
2186 : : leave it alone. */
2187 [ # # ]: 0 : if (regs->num_regs < num_regs + 1) {
2188 : 0 : regs->num_regs = num_regs + 1;
2189 : 0 : regs->start = (sal_Int32 *) realloc(regs->start, regs->num_regs * sizeof(sal_Int32));
2190 : 0 : regs->end = (sal_Int32 *) realloc(regs->end, regs->num_regs * sizeof(sal_Int32));
2191 [ # # ][ # # ]: 0 : if (regs->start == NULL || regs->end == NULL) {
2192 : 0 : FREE_VARIABLES ();
2193 : 0 : return(-2);
2194 : : }
2195 : : }
2196 : : } else { // num_regs is negative
2197 : 0 : FREE_VARIABLES ();
2198 : 0 : return(-2);
2199 : : }
2200 : :
2201 : : /* Convert the pointer data in `regstart' and `regend' to
2202 : : indices. Register zero has to be set differently,
2203 : : since we haven't kept track of any info for it. */
2204 [ # # ]: 0 : if (regs->num_regs > 0) {
2205 : : // Make sure a valid location
2206 : 0 : sal_Int32 dpos = d - string2;
2207 [ # # ][ # # ]: 0 : if (pos == dpos || (d - 1) >= dend ) {
2208 : 0 : FREE_VARIABLES ();
2209 : 0 : return(-1);
2210 : : }
2211 : 0 : regs->start[regs->num_of_match] = pos;
2212 : 0 : regs->end[regs->num_of_match] = ((sal_Int32) (d - string2));
2213 : 0 : regs->num_of_match++;
2214 : : }
2215 : :
2216 : : /* Go through the first `min (num_regs, regs->num_regs)'
2217 : : registers, since that is all we initialized. */
2218 [ # # ][ # # ]: 0 : for (mcnt = regs->num_of_match; (unsigned) mcnt < MIN(num_regs, regs->num_regs);
2219 : : mcnt++) {
2220 : 0 : regs->start[mcnt] = regs->end[mcnt] = -1;
2221 [ # # ][ # # ]: 0 : if( !(REG_UNSET(regstart[mcnt]) || REG_UNSET(regend[mcnt])) ) {
2222 : 0 : regs->start[regs->num_of_match] = (sal_Int32) POINTER_TO_OFFSET(regstart[mcnt]);
2223 : 0 : regs->end[regs->num_of_match] = (sal_Int32) POINTER_TO_OFFSET(regend[mcnt]);
2224 : 0 : regs->num_of_match++;
2225 : : }
2226 : : }
2227 : :
2228 : : /* If the regs structure we return has more elements than
2229 : : were in the pattern, set the extra elements to -1. If
2230 : : we (re)allocated the registers, this is the case,
2231 : : because we always allocate enough to have at least one
2232 : : -1 at the end. */
2233 [ # # ]: 0 : for (mcnt = regs->num_of_match; (unsigned) mcnt < regs->num_regs; mcnt++)
2234 : 0 : regs->start[mcnt] = regs->end[mcnt] = -1;
2235 : : } /* regs */
2236 : :
2237 : 0 : mcnt = d - pos - string2;
2238 : :
2239 : 0 : FREE_VARIABLES ();
2240 : 0 : return(0);
2241 : : }
2242 : : /* Otherwise match next pattern command. */
2243 [ # # # # : 0 : switch ((re_opcode_t) *p++) {
# # # # #
# # # # #
# # # # #
# # # # #
# # ]
2244 : : /* Ignore these. Used to ignore the n of succeed_n's which
2245 : : currently have n == 0. */
2246 : : case no_op:
2247 : 0 : break;
2248 : :
2249 : : case succeed:
2250 : 0 : goto succeed_label;
2251 : :
2252 : : /* Match the next n pattern characters exactly. The following
2253 : : byte in the pattern defines n, and the n bytes after that
2254 : : are the characters to match. */
2255 : : case exactn:
2256 : 0 : mcnt = *p++;
2257 : :
2258 [ # # ]: 0 : do {
2259 [ # # ]: 0 : PREFETCH();
2260 [ # # ]: 0 : if ((sal_Unicode)*d++ != (sal_Unicode) *p++) goto fail;
2261 : : } while (--mcnt);
2262 [ # # ][ # # ]: 0 : SET_REGS_MATCHED();
2263 : 0 : break;
2264 : :
2265 : : /* Match any character except possibly a newline or a null. */
2266 : : case anychar:
2267 : :
2268 [ # # ]: 0 : PREFETCH();
2269 [ # # ][ # # ]: 0 : if ( *d == (sal_Unicode)'\n' ||
2270 : : *d == (sal_Unicode)'\000' )
2271 : : goto fail;
2272 : :
2273 [ # # ][ # # ]: 0 : SET_REGS_MATCHED();
2274 : 0 : d++;
2275 : 0 : break;
2276 : :
2277 : : case charset:
2278 : : case charset_not: {
2279 : : register sal_Unicode c;
2280 : 0 : sal_Bool knot = (re_opcode_t) *(p - 1) == charset_not;
2281 : :
2282 [ # # ]: 0 : PREFETCH();
2283 : 0 : c = *d; /* The character to match. */
2284 : : /* Cast to `sal_uInt32' instead of `sal_Unicode' in case the
2285 : : bit list is a full 32 bytes long. */
2286 [ # # ][ # # ]: 0 : if ((c < (sal_uInt32) (*p * BYTEWIDTH)) && (p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))))
2287 : 0 : knot = !knot;
2288 : :
2289 : 0 : p += 1 + *p;
2290 : :
2291 [ # # ]: 0 : if (!knot) {
2292 : 0 : goto fail;
2293 : : }
2294 : :
2295 [ # # ][ # # ]: 0 : SET_REGS_MATCHED();
2296 : 0 : d++;
2297 : 0 : break;
2298 : : }
2299 : :
2300 : : /* The beginning of a group is represented by start_memory.
2301 : : The arguments are the register number in the next byte, and the
2302 : : number of groups inner to this one in the next. The text
2303 : : matched within the group is recorded (in the internal
2304 : : registers data structure) under the register number. */
2305 : : case start_memory:
2306 : :
2307 : : /* Find out if this group can match the empty string. */
2308 : 0 : p1 = p; /* To send to group_match_null_string_p. */
2309 : :
2310 [ # # ]: 0 : if (REG_MATCH_NULL_STRING_P(reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
2311 [ # # ]: 0 : REG_MATCH_NULL_STRING_P(reg_info[*p]) = group_match_null_string_p(&p1, pend, reg_info);
2312 : :
2313 : : /* Save the position in the string where we were the last time
2314 : : we were at this open-group operator in case the group is
2315 : : operated upon by a repetition operator, e.g., with `(a*)*b'
2316 : : against `ab'; then we want to ignore where we are now in
2317 : : the string in case this attempt to match fails. */
2318 : 0 : old_regstart[*p] = REG_MATCH_NULL_STRING_P(reg_info[*p])
2319 : 0 : ? REG_UNSET(regstart[*p]) ? d : regstart[*p]
2320 [ # # ][ # # ]: 0 : : regstart[*p];
2321 : :
2322 : 0 : regstart[*p] = d;
2323 : :
2324 : 0 : IS_ACTIVE (reg_info[*p]) = 1;
2325 : 0 : MATCHED_SOMETHING(reg_info[*p]) = 0;
2326 : :
2327 : : /* Clear this whenever we change the register activity status. */
2328 : 0 : set_regs_matched_done = 0;
2329 : :
2330 : : /* This is the new highest active register. */
2331 : 0 : highest_active_reg = *p;
2332 : :
2333 : : /* If nothing was active before, this is the new lowest active
2334 : : register. */
2335 [ # # ]: 0 : if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
2336 : 0 : lowest_active_reg = *p;
2337 : :
2338 : : /* Move past the register number and inner group count. */
2339 : 0 : p += 2;
2340 : 0 : just_past_start_mem = p;
2341 : :
2342 : 0 : break;
2343 : :
2344 : : /* The stop_memory opcode represents the end of a group. Its
2345 : : arguments are the same as start_memory's: the register
2346 : : number, and the number of inner groups. */
2347 : : case stop_memory:
2348 : :
2349 : : /* We need to save the string position the last time we were at
2350 : : this close-group operator in case the group is operated
2351 : : upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
2352 : : against `aba'; then we want to ignore where we are now in
2353 : : the string in case this attempt to match fails. */
2354 : 0 : old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
2355 : 0 : ? REG_UNSET(regend[*p]) ? d : regend[*p]
2356 [ # # ][ # # ]: 0 : : regend[*p];
2357 : :
2358 : 0 : regend[*p] = d;
2359 : :
2360 : : /* This register isn't active anymore. */
2361 : 0 : IS_ACTIVE(reg_info[*p]) = 0;
2362 : :
2363 : : /* Clear this whenever we change the register activity status. */
2364 : 0 : set_regs_matched_done = 0;
2365 : :
2366 : : /* If this was the only register active, nothing is active
2367 : : anymore. */
2368 [ # # ]: 0 : if (lowest_active_reg == highest_active_reg) {
2369 : 0 : lowest_active_reg = NO_LOWEST_ACTIVE_REG;
2370 : 0 : highest_active_reg = NO_HIGHEST_ACTIVE_REG;
2371 : : } else { /* We must scan for the new highest active register, since
2372 : : it isn't necessarily one less than now: consider
2373 : : (a(b)c(d(e)f)g). When group 3 ends, after the f), the
2374 : : new highest active register is 1. */
2375 : 0 : sal_Unicode r = *p - 1;
2376 [ # # ][ # # ]: 0 : while (r > 0 && !IS_ACTIVE (reg_info[r]))
[ # # ]
2377 : 0 : r--;
2378 : :
2379 : : /* If we end up at register zero, that means that we saved
2380 : : the registers as the result of an `on_failure_jump', not
2381 : : a `start_memory', and we jumped to past the innermost
2382 : : `stop_memory'. For example, in ((.)*) we save
2383 : : registers 1 and 2 as a result of the *, but when we pop
2384 : : back to the second ), we are at the stop_memory 1.
2385 : : Thus, nothing is active. */
2386 [ # # ]: 0 : if (r == 0) {
2387 : 0 : lowest_active_reg = NO_LOWEST_ACTIVE_REG;
2388 : 0 : highest_active_reg = NO_HIGHEST_ACTIVE_REG;
2389 : : } else
2390 : 0 : highest_active_reg = r;
2391 : : }
2392 : :
2393 : : /* If just failed to match something this time around with a
2394 : : group that's operated on by a repetition operator, try to
2395 : : force exit from the ``loop'', and restore the register
2396 : : information for this group that we had before trying this
2397 : : last match. */
2398 [ # # ][ # # ]: 0 : if ((!MATCHED_SOMETHING (reg_info[*p])
[ # # ]
2399 : 0 : || just_past_start_mem == p - 1)
2400 : 0 : && (p + 2) < pend) {
2401 : 0 : sal_Bool is_a_jump_n = false;
2402 : :
2403 : 0 : p1 = p + 2;
2404 : 0 : mcnt = 0;
2405 [ # # # ]: 0 : switch ((re_opcode_t) *p1++) {
2406 : : case jump_n:
2407 : 0 : is_a_jump_n = true;
2408 : : case pop_failure_jump:
2409 : : case maybe_pop_jump:
2410 : : case jump:
2411 : : case dummy_failure_jump:
2412 : 0 : extract_number_and_incr(mcnt, p1);
2413 [ # # ]: 0 : if (is_a_jump_n)
2414 : 0 : p1 += 2;
2415 : 0 : break;
2416 : :
2417 : : default:
2418 : : /* do nothing */ ;
2419 : : }
2420 : 0 : p1 += mcnt;
2421 : :
2422 : : /* If the next operation is a jump backwards in the pattern
2423 : : to an on_failure_jump right before the start_memory
2424 : : corresponding to this stop_memory, exit from the loop
2425 : : by forcing a failure after pushing on the stack the
2426 : : on_failure_jump's jump in the pattern, and d. */
2427 [ # # ][ # # ]: 0 : if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
[ # # ][ # # ]
2428 : 0 : && (re_opcode_t) p1[3] == start_memory && p1[4] == *p) {
2429 : : /* If this group ever matched anything, then restore
2430 : : what its registers were before trying this last
2431 : : failed match, e.g., with `(a*)*b' against `ab' for
2432 : : regstart[1], and, e.g., with `((a*)*(b*)*)*'
2433 : : against `aba' for regend[3].
2434 : :
2435 : : Also restore the registers for inner groups for,
2436 : : e.g., `((a*)(b*))*' against `aba' (register 3 would
2437 : : otherwise get trashed). */
2438 : :
2439 [ # # ]: 0 : if (EVER_MATCHED_SOMETHING (reg_info[*p])) {
2440 : : unsigned r;
2441 : :
2442 : 0 : EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
2443 : :
2444 : : /* Restore this and inner groups' (if any) registers. */
2445 [ # # ]: 0 : for (r = *p; r < (unsigned) *p + (unsigned) *(p + 1);
2446 : : r++) {
2447 : 0 : regstart[r] = old_regstart[r];
2448 : :
2449 : : /* xx why this test? */
2450 [ # # ]: 0 : if (old_regend[r] >= regstart[r])
2451 : 0 : regend[r] = old_regend[r];
2452 : : }
2453 : : }
2454 : 0 : p1++;
2455 : 0 : extract_number_and_incr(mcnt, p1);
2456 [ # # ][ # # ]: 0 : PUSH_FAILURE_POINT(p1 + mcnt, d, -2);
[ # # ][ # # ]
[ # # ]
2457 : :
2458 : 0 : goto fail;
2459 : : }
2460 : : }
2461 : :
2462 : : /* Move past the register number and the inner group count. */
2463 : 0 : p += 2;
2464 : 0 : break;
2465 : :
2466 : :
2467 : : /* \<digit> has been turned into a `duplicate' command which is
2468 : : followed by the numeric value of <digit> as the register number. */
2469 : : case duplicate:
2470 : : {
2471 : : register const sal_Unicode *d2, *dend2;
2472 : 0 : sal_Unicode regno = *p++; /* Get which register to match against. */
2473 : :
2474 : : /* Can't back reference a group which we've never matched. */
2475 [ # # ][ # # ]: 0 : if (REG_UNSET(regstart[regno]) || REG_UNSET(regend[regno])) {
2476 : : goto fail;
2477 : : }
2478 : :
2479 : : /* Where in input to try to start matching. */
2480 : 0 : d2 = regstart[regno];
2481 : :
2482 : : /* Where to stop matching; if both the place to start and
2483 : : the place to stop matching are in the same string, then
2484 : : set to the place to stop, otherwise, for now have to use
2485 : : the end of the first string. */
2486 : :
2487 : 0 : dend2 = regend[regno];
2488 : 0 : for (;;) {
2489 : : /* If necessary, advance to next segment in register
2490 : : contents. */
2491 [ # # ]: 0 : while (d2 == dend2) {
2492 [ # # ]: 0 : if (dend2 == end_match_2) break;
2493 [ # # ]: 0 : if (dend2 == regend[regno]) break;
2494 : : }
2495 : : /* At end of register contents => success */
2496 [ # # ]: 0 : if (d2 == dend2) break;
2497 : :
2498 [ # # ]: 0 : PREFETCH();
2499 : :
2500 : : /* How many characters left in this segment to match. */
2501 : 0 : mcnt = dend - d;
2502 : :
2503 : : /* Want how many consecutive characters we can match in
2504 : : one shot, so, if necessary, adjust the count. */
2505 [ # # ]: 0 : if (mcnt > dend2 - d2)
2506 : 0 : mcnt = dend2 - d2;
2507 : :
2508 : : /* Compare that many; failure if mismatch, else move
2509 : : past them. */
2510 [ # # ]: 0 : if (translate
2511 [ # # ]: 0 : ? bcmp_translate(d, d2, mcnt)
2512 [ # # ]: 0 : : memcmp(d, d2, mcnt * sizeof(sal_Unicode))) {
2513 : 0 : goto fail;
2514 : : }
2515 : 0 : d += mcnt, d2 += mcnt;
2516 : : /* Do this because we've match some characters. */
2517 [ # # ][ # # ]: 0 : SET_REGS_MATCHED();
2518 : : }
2519 : : }
2520 : 0 : break;
2521 : :
2522 : : /* begline matches the empty string at the beginning of the string
2523 : : (unless `not_bol' is set in `bufp'), and, if
2524 : : `newline_anchor' is set, after newlines. */
2525 : : case begline:
2526 : :
2527 [ # # ][ # # ]: 0 : if (AT_STRINGS_BEG (d)) {
2528 [ # # ]: 0 : if (!bufp->not_bol) break;
2529 [ # # ][ # # ]: 0 : } else if (d[-1] == '\n' && bufp->newline_anchor) {
2530 : 0 : break;
2531 : : }
2532 : : /* In all other cases, we fail. */
2533 : 0 : goto fail;
2534 : :
2535 : : /* endline is the dual of begline. */
2536 : : case endline:
2537 : :
2538 [ # # ]: 0 : if (AT_STRINGS_END(d)) {
2539 [ # # ]: 0 : if (!bufp->not_eol) break;
2540 [ # # ][ # # ]: 0 : } else if (*d == '\n' && bufp->newline_anchor) {
2541 : 0 : break;
2542 : : }
2543 : 0 : goto fail;
2544 : :
2545 : : /* Match at the very beginning of the data. */
2546 : : case begbuf:
2547 [ # # ][ # # ]: 0 : if (AT_STRINGS_BEG (d))
2548 : 0 : break;
2549 : 0 : goto fail;
2550 : :
2551 : :
2552 : : /* Match at the very end of the data. */
2553 : : case endbuf:
2554 [ # # ]: 0 : if (AT_STRINGS_END (d))
2555 : 0 : break;
2556 : 0 : goto fail;
2557 : :
2558 : :
2559 : : /* on_failure_keep_string_jump is used to optimize `.*\n'. It
2560 : : pushes NULL as the value for the string on the stack. Then
2561 : : `pop_failure_point' will keep the current value for the
2562 : : string, instead of restoring it. To see why, consider
2563 : : matching `foo\nbar' against `.*\n'. The .* matches the foo;
2564 : : then the . fails against the \n. But the next thing we want
2565 : : to do is match the \n against the \n; if we restored the
2566 : : string value, we would be back at the foo.
2567 : :
2568 : : Because this is used only in specific cases, we don't need to
2569 : : check all the things that `on_failure_jump' does, to make
2570 : : sure the right things get saved on the stack. Hence we don't
2571 : : share its code. The only reason to push anything on the
2572 : : stack at all is that otherwise we would have to change
2573 : : `anychar's code to do something besides goto fail in this
2574 : : case; that seems worse than this. */
2575 : : case on_failure_keep_string_jump:
2576 : :
2577 : 0 : extract_number_and_incr(mcnt, p);
2578 : :
2579 [ # # ][ # # ]: 0 : PUSH_FAILURE_POINT(p + mcnt, NULL, -2);
[ # # ][ # # ]
[ # # ]
2580 : 0 : break;
2581 : :
2582 : :
2583 : : /* Uses of on_failure_jump:
2584 : :
2585 : : Each alternative starts with an on_failure_jump that points
2586 : : to the beginning of the next alternative. Each alternative
2587 : : except the last ends with a jump that in effect jumps past
2588 : : the rest of the alternatives. (They really jump to the
2589 : : ending jump of the following alternative, because tensioning
2590 : : these jumps is a hassle.)
2591 : :
2592 : : Repeats start with an on_failure_jump that points past both
2593 : : the repetition text and either the following jump or
2594 : : pop_failure_jump back to this on_failure_jump. */
2595 : : case on_failure_jump:
2596 : : on_failure:
2597 : :
2598 : 0 : extract_number_and_incr(mcnt, p);
2599 : :
2600 : : /* If this on_failure_jump comes right before a group (i.e.,
2601 : : the original * applied to a group), save the information
2602 : : for that group and all inner ones, so that if we fail back
2603 : : to this point, the group's information will be correct.
2604 : : For example, in \(a*\)*\1, we need the preceding group,
2605 : : and in \(zz\(a*\)b*\)\2, we need the inner group. */
2606 : :
2607 : : /* We can't use `p' to check ahead because we push
2608 : : a failure point to `p + mcnt' after we do this. */
2609 : 0 : p1 = p;
2610 : :
2611 : : /* We need to skip no_op's before we look for the
2612 : : start_memory in case this on_failure_jump is happening as
2613 : : the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
2614 : : against aba. */
2615 [ # # ][ # # ]: 0 : while (p1 < pend && (re_opcode_t) *p1 == no_op)
[ # # ]
2616 : 0 : p1++;
2617 : :
2618 [ # # ][ # # ]: 0 : if (p1 < pend && (re_opcode_t) *p1 == start_memory) {
2619 : : /* We have a new highest active register now. This will
2620 : : get reset at the start_memory we are about to get to,
2621 : : but we will have saved all the registers relevant to
2622 : : this repetition op, as described above. */
2623 : 0 : highest_active_reg = *(p1 + 1) + *(p1 + 2);
2624 [ # # ]: 0 : if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
2625 : 0 : lowest_active_reg = *(p1 + 1);
2626 : : }
2627 : :
2628 [ # # ][ # # ]: 0 : PUSH_FAILURE_POINT(p + mcnt, d, -2);
[ # # ][ # # ]
[ # # ]
2629 : 0 : break;
2630 : :
2631 : : /* A smart repeat ends with `maybe_pop_jump'.
2632 : : We change it to either `pop_failure_jump' or `jump'. */
2633 : : case maybe_pop_jump:
2634 : 0 : extract_number_and_incr(mcnt, p);
2635 : : {
2636 : 0 : register sal_Unicode *p2 = p;
2637 : :
2638 : : /* Compare the beginning of the repeat with what in the
2639 : : pattern follows its end. If we can establish that there
2640 : : is nothing that they would both match, i.e., that we
2641 : : would have to backtrack because of (as in, e.g., `a*a')
2642 : : then we can change to pop_failure_jump, because we'll
2643 : : never have to backtrack.
2644 : :
2645 : : This is not true in the case of alternatives: in
2646 : : `(a|ab)*' we do need to backtrack to the `ab' alternative
2647 : : (e.g., if the string was `ab'). But instead of trying to
2648 : : detect that here, the alternative has put on a dummy
2649 : : failure point which is what we will end up popping. */
2650 : :
2651 : : /* Skip over open/close-group commands.
2652 : : If what follows this loop is a ...+ construct,
2653 : : look at what begins its body, since we will have to
2654 : : match at least one of that. */
2655 : 0 : while (1) {
2656 [ # # ][ # # ]: 0 : if (p2 + 2 < pend
[ # # ]
2657 : : && ((re_opcode_t) *p2 == stop_memory
2658 : : || (re_opcode_t) *p2 == start_memory))
2659 : 0 : p2 += 3;
2660 [ # # ][ # # ]: 0 : else if (p2 + 6 < pend
2661 : : && (re_opcode_t) *p2 == dummy_failure_jump)
2662 : 0 : p2 += 6;
2663 : : else
2664 : 0 : break;
2665 : : }
2666 : :
2667 : 0 : p1 = p + mcnt;
2668 : : /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
2669 : : to the `maybe_finalize_jump' of this case. Examine what
2670 : : follows. */
2671 : :
2672 : : /* If we're at the end of the pattern, we can change. */
2673 [ # # ]: 0 : if (p2 == pend) {
2674 : : /* Consider what happens when matching ":\(.*\)"
2675 : : against ":/". I don't really understand this code
2676 : : yet. */
2677 : 0 : p[-3] = (sal_Unicode) pop_failure_jump;
2678 [ # # ][ # # ]: 0 : } else if ((re_opcode_t) *p2 == exactn
[ # # ]
2679 : : || (bufp->newline_anchor && (re_opcode_t) *p2 == endline)) {
2680 [ # # ]: 0 : register sal_Unicode c = *p2 == (sal_Unicode) endline ? (sal_Unicode)'\n' : p2[2];
2681 : :
2682 [ # # ][ # # ]: 0 : if ((re_opcode_t) p1[3] == exactn && p1[5] != c) {
2683 : 0 : p[-3] = (sal_Unicode) pop_failure_jump;
2684 [ # # ][ # # ]: 0 : } else if ((re_opcode_t) p1[3] == charset
2685 : 0 : || (re_opcode_t) p1[3] == charset_not) {
2686 : 0 : sal_Int32 knot = (re_opcode_t) p1[3] == charset_not;
2687 : :
2688 [ # # ][ # # ]: 0 : if (c < (sal_Unicode) (p1[4] * BYTEWIDTH)
2689 : 0 : && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
2690 : 0 : knot = !knot;
2691 : :
2692 : : /* `not' is equal to 1 if c would match, which means
2693 : : that we can't change to pop_failure_jump. */
2694 [ # # ]: 0 : if (!knot) {
2695 : 0 : p[-3] = (unsigned char) pop_failure_jump;
2696 : : }
2697 : 0 : }
2698 [ # # ]: 0 : } else if ((re_opcode_t) *p2 == charset) {
2699 : : /* We win if the first character of the loop is not part
2700 : : of the charset. */
2701 [ # # ][ # # ]: 0 : if ((re_opcode_t) p1[3] == exactn
2702 : 0 : && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5]
2703 : 0 : && (p2[2 + p1[5] / BYTEWIDTH]
2704 [ # # ]: 0 : & (1 << (p1[5] % BYTEWIDTH))))) {
2705 : 0 : p[-3] = (sal_Unicode) pop_failure_jump;
2706 [ # # ]: 0 : } else if ((re_opcode_t) p1[3] == charset_not) {
2707 : : sal_Int32 idx;
2708 : : /* We win if the charset_not inside the loop
2709 : : lists every character listed in the charset after. */
2710 [ # # ]: 0 : for (idx = 0; idx < (int) p2[1]; idx++)
2711 [ # # ]: 0 : if (! (p2[2 + idx] == 0
2712 : 0 : || (idx < (int) p1[4]
2713 [ # # ][ # # ]: 0 : && ((p2[2 + idx] & ~ p1[5 + idx]) == 0))))
2714 : 0 : break;
2715 : :
2716 [ # # ]: 0 : if (idx == p2[1]) {
2717 : 0 : p[-3] = (sal_Unicode) pop_failure_jump;
2718 : : }
2719 [ # # ]: 0 : } else if ((re_opcode_t) p1[3] == charset) {
2720 : : sal_Int32 idx;
2721 : : /* We win if the charset inside the loop
2722 : : has no overlap with the one after the loop. */
2723 [ # # ][ # # ]: 0 : for (idx = 0;
[ # # ]
2724 : 0 : idx < (sal_Int32) p2[1] && idx < (sal_Int32) p1[4];
2725 : : idx++)
2726 [ # # ]: 0 : if ((p2[2 + idx] & p1[5 + idx]) != 0)
2727 : 0 : break;
2728 : :
2729 [ # # ][ # # ]: 0 : if (idx == p2[1] || idx == p1[4]) {
2730 : 0 : p[-3] = (sal_Unicode) pop_failure_jump;
2731 : : }
2732 : : }
2733 : : }
2734 : : }
2735 : 0 : p -= 2; /* Point at relative address again. */
2736 [ # # ]: 0 : if ((re_opcode_t) p[-1] != pop_failure_jump) {
2737 : 0 : p[-1] = (sal_Unicode) jump;
2738 : 0 : goto unconditional_jump;
2739 : : }
2740 : : /* Note fall through. */
2741 : :
2742 : :
2743 : : /* The end of a simple repeat has a pop_failure_jump back to
2744 : : its matching on_failure_jump, where the latter will push a
2745 : : failure point. The pop_failure_jump takes off failure
2746 : : points put on by this pop_failure_jump's matching
2747 : : on_failure_jump; we got through the pattern to here from the
2748 : : matching on_failure_jump, so didn't fail. */
2749 : : case pop_failure_jump:
2750 : : {
2751 : : /* We need to pass separate storage for the lowest and
2752 : : highest registers, even though we don't care about the
2753 : : actual values. Otherwise, we will restore only one
2754 : : register from the stack, since lowest will == highest in
2755 : : `pop_failure_point'. */
2756 : : sal_uInt32 dummy_low_reg, dummy_high_reg;
2757 : 0 : sal_Unicode *pdummy = NULL;
2758 : 0 : const sal_Unicode *sdummy = NULL;
2759 : :
2760 [ # # ][ # # ]: 0 : POP_FAILURE_POINT(sdummy, pdummy,
2761 : : dummy_low_reg, dummy_high_reg,
2762 : : reg_dummy, reg_dummy, reg_info_dummy);
2763 : :
2764 : : (void)sdummy;
2765 : : (void)pdummy;
2766 : : }
2767 : : /* Note fall through. */
2768 : :
2769 : : unconditional_jump:
2770 : : /* Note fall through. */
2771 : :
2772 : : /* Unconditionally jump (without popping any failure points). */
2773 : : case jump:
2774 : 0 : extract_number_and_incr(mcnt, p); /* Get the amount to jump. */
2775 : 0 : p += mcnt; /* Do the jump. */
2776 : 0 : break;
2777 : :
2778 : : /* We need this opcode so we can detect where alternatives end
2779 : : in `group_match_null_string_p' et al. */
2780 : : case jump_past_alt:
2781 : 0 : goto unconditional_jump;
2782 : :
2783 : :
2784 : : /* Normally, the on_failure_jump pushes a failure point, which
2785 : : then gets popped at pop_failure_jump. We will end up at
2786 : : pop_failure_jump, also, and with a pattern of, say, `a+', we
2787 : : are skipping over the on_failure_jump, so we have to push
2788 : : something meaningless for pop_failure_jump to pop. */
2789 : : case dummy_failure_jump:
2790 : : /* It doesn't matter what we push for the string here. What
2791 : : the code at `fail' tests is the value for the pattern. */
2792 [ # # ][ # # ]: 0 : PUSH_FAILURE_POINT(NULL, NULL, -2);
[ # # ][ # # ]
[ # # ]
2793 : 0 : goto unconditional_jump;
2794 : :
2795 : :
2796 : : /* At the end of an alternative, we need to push a dummy failure
2797 : : point in case we are followed by a `pop_failure_jump', because
2798 : : we don't want the failure point for the alternative to be
2799 : : popped. For example, matching `(a|ab)*' against `aab'
2800 : : requires that we match the `ab' alternative. */
2801 : : case push_dummy_failure:
2802 : : /* See comments just above at `dummy_failure_jump' about the
2803 : : two zeroes. */
2804 [ # # ][ # # ]: 0 : PUSH_FAILURE_POINT(NULL, NULL, -2);
[ # # ][ # # ]
[ # # ]
2805 : 0 : break;
2806 : :
2807 : : /* Have to succeed matching what follows at least n times.
2808 : : After that, handle like `on_failure_jump'. */
2809 : : case succeed_n:
2810 : 0 : extract_number(mcnt, p + 2);
2811 : :
2812 : : assert (mcnt >= 0);
2813 : : /* Originally, this is how many times we HAVE to succeed. */
2814 [ # # ]: 0 : if (mcnt > 0) {
2815 : 0 : mcnt--;
2816 : 0 : p += 2;
2817 : 0 : store_number_and_incr (p, mcnt);
2818 [ # # ]: 0 : } else if (mcnt == 0) {
2819 : 0 : p[2] = (sal_Unicode) no_op;
2820 : 0 : p[3] = (sal_Unicode) no_op;
2821 : 0 : goto on_failure;
2822 : : }
2823 : 0 : break;
2824 : :
2825 : : case jump_n:
2826 : 0 : extract_number(mcnt, p + 2);
2827 : :
2828 : : /* Originally, this is how many times we CAN jump. */
2829 [ # # ]: 0 : if (mcnt) {
2830 : 0 : mcnt--;
2831 : 0 : store_number (p + 2, mcnt);
2832 : 0 : goto unconditional_jump;
2833 : : }
2834 : : /* If don't have to jump any more, skip over the rest of command. */
2835 : : else
2836 : 0 : p += 4;
2837 : 0 : break;
2838 : :
2839 : : case set_number_at:
2840 : : {
2841 : :
2842 : 0 : extract_number_and_incr(mcnt, p);
2843 : 0 : p1 = p + mcnt;
2844 : 0 : extract_number_and_incr(mcnt, p);
2845 : 0 : store_number (p1, mcnt);
2846 : 0 : break;
2847 : : }
2848 : :
2849 : : case wordbeg:
2850 [ # # ][ # # ]: 0 : if (iswordbegin(d, string2, size2))
2851 : 0 : break;
2852 : 0 : goto fail;
2853 : :
2854 : : case wordend:
2855 [ # # ][ # # ]: 0 : if (iswordend(d, string2, size2))
2856 : 0 : break;
2857 : 0 : goto fail;
2858 : :
2859 : :
2860 : : default:
2861 : 0 : abort();
2862 : : }
2863 : 0 : continue; /* Successfully executed one pattern command; keep going. */
2864 : :
2865 : : /* We goto here if a matching operation fails. */
2866 : : fail:
2867 [ # # ]: 0 : if (!FAIL_STACK_EMPTY()) {
2868 : : /* A restart point is known. Restore to that state. */
2869 [ # # ][ # # ]: 0 : POP_FAILURE_POINT(d, p,
2870 : : lowest_active_reg, highest_active_reg,
2871 : : regstart, regend, reg_info);
2872 : :
2873 : : /* If this failure point is a dummy, try the next one. */
2874 [ # # ]: 0 : if (!p)
2875 : 0 : goto fail;
2876 : :
2877 : : /* If we failed to the end of the pattern, don't examine *p. */
2878 : : assert(p <= pend);
2879 [ # # ]: 0 : if (p < pend) {
2880 : 0 : sal_Bool is_a_jump_n = false;
2881 : :
2882 : : /* If failed to a backwards jump that's part of a repetition
2883 : : loop, need to pop this failure point and use the next
2884 : : one. */
2885 [ # # # ]: 0 : switch ((re_opcode_t) *p) {
2886 : : case jump_n:
2887 : 0 : is_a_jump_n = true;
2888 : : case maybe_pop_jump:
2889 : : case pop_failure_jump:
2890 : : case jump:
2891 : 0 : p1 = p + 1;
2892 : 0 : extract_number_and_incr(mcnt, p1);
2893 : 0 : p1 += mcnt;
2894 : :
2895 [ # # ][ # # ]: 0 : if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
[ # # ][ # # ]
2896 : : || (!is_a_jump_n
2897 : : && (re_opcode_t) *p1 == on_failure_jump)) {
2898 : : goto fail;
2899 : : }
2900 : 0 : break;
2901 : : default:
2902 : : /* do nothing */ ;
2903 : : }
2904 : : }
2905 : :
2906 : : } else {
2907 : 0 : break; /* Matching at this starting point really fails. */
2908 : : }
2909 : : } /* for (;;) */
2910 : :
2911 : 0 : FREE_VARIABLES ();
2912 : :
2913 : 0 : return(-1); /* Failure to match. */
2914 : : } /* re_match2 */
2915 : :
2916 : : /* Set the bit for character C in a list. */
2917 : : void
2918 : 0 : Regexpr::set_list_bit(sal_Unicode c, sal_Unicode *b)
2919 : : {
2920 [ # # ]: 0 : if ( translate ) {
2921 : : try {
2922 [ # # ][ # # ]: 0 : sal_Unicode tmp = translit->transliterateChar2Char(c);
2923 : 0 : b[tmp / BYTEWIDTH] |= 1 << (tmp % BYTEWIDTH);
2924 [ # # ]: 0 : } catch (const ::com::sun::star::i18n::MultipleCharsOutputException&) {
2925 [ # # # # ]: 0 : ::rtl::OUString o2( translit->transliterateChar2String( c));
2926 : 0 : sal_Int32 len2 = o2.getLength();
2927 : 0 : const sal_Unicode * k2 = o2.getStr();
2928 [ # # ]: 0 : for (sal_Int32 nmatch = 0; nmatch < len2; nmatch++) {
2929 : 0 : b[k2[nmatch] / BYTEWIDTH] |= 1 << (k2[nmatch] % BYTEWIDTH);
2930 : 0 : }
2931 : : }
2932 : : } else {
2933 : 0 : b[c / BYTEWIDTH] |= 1 << (c % BYTEWIDTH);
2934 : : }
2935 : 0 : }
2936 : :
2937 : : /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|