modules/up/src/gnug++/regex.c
/* [<][>][^][v][top][bottom][index][help] */
FUNCTIONS
This source file includes the following functions.
- bcmp
- bcopy
- bzero
- init_syntax_once
- SYNTAX
- isascii
- ISBLANK
- ISBLANK
- ISGRAPH
- ISGRAPH
- ISPRINT
- ISDIGIT
- ISALNUM
- ISALPHA
- ISCNTRL
- ISLOWER
- ISPUNCT
- ISSPACE
- ISUPPER
- ISXDIGIT
- SIGN_EXTEND_CHAR
- SIGN_EXTEND_CHAR
- REGEX_REALLOCATE
- REGEX_REALLOCATE
- FIRST_STRING_P
- TALLOC
- RETALLOC
- REGEX_TALLOC
- STREQ
- MAX
- MIN
- STORE_NUMBER
- STORE_NUMBER_AND_INCR
- EXTRACT_NUMBER
- extract_number
- EXTRACT_NUMBER
- EXTRACT_NUMBER_AND_INCR
- extract_number_and_incr
- EXTRACT_NUMBER_AND_INCR
- DEBUG_STATEMENT
- DEBUG_PRINT1
- DEBUG_PRINT2
- DEBUG_PRINT3
- DEBUG_PRINT4
- DEBUG_PRINT_COMPILED_PATTERN
- DEBUG_PRINT_DOUBLE_STRING
- printchar
- print_fastmap
- print_partial_compiled_pattern
- print_compiled_pattern
- print_double_string
- assert
- DEBUG_STATEMENT
- DEBUG_PRINT1
- DEBUG_PRINT2
- DEBUG_PRINT3
- DEBUG_PRINT4
- DEBUG_PRINT_COMPILED_PATTERN
- DEBUG_PRINT_DOUBLE_STRING
- re_set_syntax
- PATFETCH
- PATFETCH_RAW
- TRANSLATE
- GET_BUFFER_SPACE
- BUF_PUSH
- BUF_PUSH_2
- BUF_PUSH_3
- STORE_JUMP
- STORE_JUMP2
- INSERT_JUMP
- INSERT_JUMP2
- EXTEND_BUFFER
- SET_LIST_BIT
- GET_UNSIGNED_NUMBER
- IS_CHAR_CLASS
- regex_compile
- store_op1
- store_op2
- insert_op1
- insert_op2
- at_begline_loc_p
- at_endline_loc_p
- group_in_compile_stack
- compile_range
- FAIL_STACK_EMPTY
- FAIL_STACK_PTR_EMPTY
- FAIL_STACK_FULL
- FAIL_STACK_TOP
- INIT_FAIL_STACK
- DOUBLE_FAIL_STACK
- PUSH_PATTERN_OP
- PUSH_FAILURE_ITEM
- POP_FAILURE_ITEM
- DEBUG_POP
- DEBUG_PUSH
- DEBUG_POP
- PUSH_FAILURE_POINT
- POP_FAILURE_POINT
- re_compile_fastmap
- re_set_registers
- re_search
- re_search_2
- REG_MATCH_NULL_STRING_P
- IS_ACTIVE
- MATCHED_SOMETHING
- EVER_MATCHED_SOMETHING
- SET_REGS_MATCHED
- POINTER_TO_OFFSET
- REG_UNSET
- PREFETCH
- AT_STRINGS_BEG
- AT_STRINGS_END
- WORDCHAR_P
- AT_WORD_BOUNDARY
- FREE_VAR
- FREE_VARIABLES
- FREE_VARIABLES
- re_match
- re_match_2
- group_match_null_string_p
- alt_match_null_string_p
- common_op_match_null_string_p
- bcmp_translate
- re_compile_pattern
- re_comp
- re_exec
- regcomp
- regexec
- regerror
- regfree
1 /* Extended regular expression matching and search library,
2 version 0.12.
3 (Implements POSIX draft P10003.2/D11.2, except for
4 internationalization features.)
5
6 Copyright (C) 1993 Free Software Foundation, Inc.
7
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
11 any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
21
22 /* AIX requires this to be the first thing in the file. */
23 #if defined (_AIX) && !defined (REGEX_MALLOC)
24 #pragma alloca
25 #endif
26
27 #define _GNU_SOURCE
28
29 /* We need this for `regex.h', and perhaps for the Emacs include files. */
30 #include <sys/types.h>
31
32 #ifdef HAVE_CONFIG_H
33 #include "config.h"
34 #endif
35
36 /* The `emacs' switch turns on certain matching commands
37 that make sense only in Emacs. */
38 #ifdef emacs
39
40 #include "lisp.h"
41 #include "buffer.h"
42 #include "syntax.h"
43
44 /* Emacs uses `NULL' as a predicate. */
45 #undef NULL
46
47 #else /* not emacs */
48
49 /* We used to test for `BSTRING' here, but only GCC and Emacs define
50 `BSTRING', as far as I know, and neither of them use this code. */
51 #if HAVE_STRING_H || STDC_HEADERS
52 #include <string.h>
53 #ifndef bcmp
54 #define bcmp(s1, s2, n) memcmp ((s1), (s2), (n))
/* [<][>][^][v][top][bottom][index][help] */
55 #endif
56 #ifndef bcopy
57 #define bcopy(s, d, n) memcpy ((d), (s), (n))
/* [<][>][^][v][top][bottom][index][help] */
58 #endif
59 #ifndef bzero
60 #define bzero(s, n) memset ((s), 0, (n))
/* [<][>][^][v][top][bottom][index][help] */
61 #endif
62 #else
63 #include <strings.h>
64 #endif
65
66 #ifdef STDC_HEADERS
67 #include <stdlib.h>
68 #else
69 char *malloc ();
70 char *realloc ();
71 #endif
72
73
74 /* Define the syntax stuff for \<, \>, etc. */
75
76 /* This must be nonzero for the wordchar and notwordchar pattern
77 commands in re_match_2. */
78 #ifndef Sword
79 #define Sword 1
80 #endif
81
82 #ifdef SYNTAX_TABLE
83
84 extern char *re_syntax_table;
85
86 #else /* not SYNTAX_TABLE */
87
88 /* How many characters in the character set. */
89 #define CHAR_SET_SIZE 256
90
91 static char re_syntax_table[CHAR_SET_SIZE];
92
93 static void
94 init_syntax_once ()
/* [<][>][^][v][top][bottom][index][help] */
95 {
96 register int c;
97 static int done = 0;
98
99 if (done)
100 return;
101
102 bzero (re_syntax_table, sizeof re_syntax_table);
103
104 for (c = 'a'; c <= 'z'; c++)
105 re_syntax_table[c] = Sword;
106
107 for (c = 'A'; c <= 'Z'; c++)
108 re_syntax_table[c] = Sword;
109
110 for (c = '0'; c <= '9'; c++)
111 re_syntax_table[c] = Sword;
112
113 re_syntax_table['_'] = Sword;
114
115 done = 1;
116 }
117
118 #endif /* not SYNTAX_TABLE */
119
120 #define SYNTAX(c) re_syntax_table[c]
/* [<][>][^][v][top][bottom][index][help] */
121
122 #endif /* not emacs */
123
124 /* Get the interface, including the syntax bits. */
125 #include "regex.h"
126
127 /* isalpha etc. are used for the character classes. */
128 #include <ctype.h>
129
/* When <ctype.h> supplies no `isascii' macro, substitute one that
   accepts every character.  */
#ifndef isascii
#define isascii(c) 1
#endif

/* Common shape of the wrappers below: C has to pass `isascii' before the
   <ctype.h> predicate PRED is consulted.  */
#define ISASCII_AND(pred, c) (isascii (c) && pred (c))

/* `isblank' and `isgraph' are used only when <ctype.h> defines them as
   macros; otherwise equivalent tests are spelled out by hand.  */
#ifdef isblank
#define ISBLANK(c) ISASCII_AND (isblank, c)
#else
#define ISBLANK(c) ((c) == ' ' || (c) == '\t')
#endif

#ifdef isgraph
#define ISGRAPH(c) ISASCII_AND (isgraph, c)
#else
#define ISGRAPH(c) (isascii (c) && isprint (c) && !isspace (c))
#endif

/* Character-class predicates used for the [:class:] names.  */
#define ISPRINT(c) ISASCII_AND (isprint, c)
#define ISDIGIT(c) ISASCII_AND (isdigit, c)
#define ISALNUM(c) ISASCII_AND (isalnum, c)
#define ISALPHA(c) ISASCII_AND (isalpha, c)
#define ISCNTRL(c) ISASCII_AND (iscntrl, c)
#define ISLOWER(c) ISASCII_AND (islower, c)
#define ISPUNCT(c) ISASCII_AND (ispunct, c)
#define ISSPACE(c) ISASCII_AND (isspace, c)
#define ISUPPER(c) ISASCII_AND (isupper, c)
#define ISXDIGIT(c) ISASCII_AND (isxdigit, c)
/* [<][>][^][v][top][bottom][index][help] */
155
156 #ifndef NULL
157 #define NULL 0
158 #endif
159
/* Discard any earlier `SIGN_EXTEND_CHAR': the definition below is meant
   to behave the same for every combination of machine, compiler, and
   `char' / `unsigned char' argument.  (Per Bothner suggested the basic
   approach.)  */
#undef SIGN_EXTEND_CHAR
#if __STDC__
/* A standard compiler understands `signed char' directly.  */
#define SIGN_EXTEND_CHAR(c) ((signed char) (c))
#else /* not __STDC__ */
/* As in Harbison and Steele: flip the top bit of the byte, then
   subtract its weight.  */
#define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 0x80) - 0x80)
#endif
171
172 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
173 use `alloca' instead of `malloc'. This is because using malloc in
174 re_search* or re_match* could cause memory leaks when C-g is used in
175 Emacs; also, malloc is slower and causes storage fragmentation. On
176 the other hand, malloc is more portable, and easier to debug.
177
178 Because we sometimes use alloca, some routines have to be macros,
179 not functions -- `alloca'-allocated space disappears at the end of the
180 function it is called in. */
181
182 #ifdef REGEX_MALLOC
183
184 #define REGEX_ALLOCATE malloc
185 #define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
/* [<][>][^][v][top][bottom][index][help] */
186
187 #else /* not REGEX_MALLOC */
188
189 /* Emacs already defines alloca, sometimes. */
190 #ifndef alloca
191
192 /* Make alloca work the best possible way. */
193 #ifdef __GNUC__
194 #define alloca __builtin_alloca
195 #else /* not __GNUC__ */
196 #if HAVE_ALLOCA_H
197 #include <alloca.h>
198 #else /* not __GNUC__ or HAVE_ALLOCA_H */
199 #ifndef _AIX /* Already did AIX, up at the top. */
200 char *alloca ();
201 #endif /* not _AIX */
202 #endif /* not HAVE_ALLOCA_H */
203 #endif /* not __GNUC__ */
204
205 #endif /* not alloca */
206
207 #define REGEX_ALLOCATE alloca
208
209 /* Assumes a `char *destination' variable. */
210 #define REGEX_REALLOCATE(source, osize, nsize) \
/* [<][>][^][v][top][bottom][index][help] */
211 (destination = (char *) alloca (nsize), \
212 bcopy (source, destination, osize), \
213 destination)
214
215 #endif /* not REGEX_MALLOC */
216
217
/* Nonzero when `size1' is nonzero and PTR points somewhere inside
   `string1' or exactly one past its end.  The original authors note that
   this also works when PTR is NULL.  `string1' and `size1' must be in
   scope wherever the macro is used.  */
#define FIRST_STRING_P(ptr) \
  (size1 != 0 && (ptr) >= string1 && (ptr) <= string1 + size1)

/* Allocate, or resize, room for N items of type T with malloc/realloc.
   The result is NULL when the allocation fails; RETALLOC also stores the
   result back into ADDR.  */
#define TALLOC(n, t) ((t *) malloc (sizeof (t) * (n)))
#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, sizeof (t) * (n)))
/* Same as TALLOC, but through whichever allocator REGEX_ALLOCATE names.  */
#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE (sizeof (t) * (n)))

/* Width of a byte, in bits.  */
#define BYTEWIDTH 8

/* Nonzero when the strings S1 and S2 compare equal.  */
#define STREQ(s1, s2) (strcmp (s1, s2) == 0)

/* Larger / smaller of A and B.  The selected argument is evaluated twice.  */
#define MAX(a, b) ((b) < (a) ? (a) : (b))
#define MIN(a, b) ((b) > (a) ? (a) : (b))
/* [<][>][^][v][top][bottom][index][help] */
235
236 typedef char boolean;
237 #define false 0
238 #define true 1
239
/* Command codes found in a compiled regular expression.  Some are
   followed by argument bytes, whose meaning is entirely up to the
   individual command.  Zero bytes may occur inside a compiled pattern.

   Emacs's search.c (search_buffer) needs the value of `exactn', so
   regex.h defines `RE_EXACTN_VALUE' as 1 and `exactn' has to be 1 here
   as well.  */

typedef enum
{
  no_op = 0,

  /* Argument: one byte N, then N literal bytes.  */
  exactn = 1,

  /* Match any (more or less) character.  */
  anychar,

  /* Match one character out of a set.  Argument: one byte giving the
     number of bitmap bytes, then the bitmap itself.  Bits within a byte
     run low-bit-first and a 1 bit means the character is a member.  A
     character too large to have a bit in the map is not a member.  */
  charset,

  /* Takes the same arguments as charset, but matches any character
     that is NOT a member.  */
  charset_not,

  /* Begin recording matched text for a register.  Arguments: one byte
     with the register number (0 up to one less than the pattern
     buffer's re_nsub field), then one byte with the number of groups
     nested inside this one.  (The second byte lives here only because
     the on_failure_jump case of re_match_2 needs it.)  */
  start_memory,

  /* Finish recording matched text and store it in a register.
     Arguments are as for start_memory: register number, then number of
     inner groups.  (The count is repeated here because there is no easy
     way to get from a stop_memory back to its start_memory.)  */
  stop_memory,

  /* Match a copy of previously remembered text.  Argument: one byte
     with the register number.  */
  duplicate,

  /* Fail unless at beginning of line.  */
  begline,

  /* Fail unless at end of line.  */
  endline,

  /* Succeed at the beginning of the buffer (Emacs) or of the string
     being matched (otherwise).  */
  begbuf,

  /* Likewise for the end of the buffer/string.  */
  endbuf,

  /* Argument: two-byte relative address to jump to.  */
  jump,

  /* A jump that also marks the end of an alternative.  */
  jump_past_alt,

  /* Argument: two-byte relative address at which to resume if matching
     fails.  */
  on_failure_jump,

  /* As on_failure_jump, except that a placeholder is pushed in place of
     the current string position.  */
  on_failure_keep_string_jump,

  /* Discard the most recent failure point, then jump to the two-byte
     relative address that follows.  */
  pop_failure_jump,

  /* Used to jump back to the start of a repeat.  Argument: two-byte
     address.  It gets rewritten to pop_failure_jump when whatever
     follows the jump clearly cannot match what the repeat matches (so
     backtracking out of repetitions already matched is pointless), and
     to jump otherwise.  */
  maybe_pop_jump,

  /* Jump to the following two-byte address and push a dummy failure
     point, which is discarded if an attempt is made to use it for a
     failure.  A `+' construct emits this before the first repeat; it
     also serves as an intermediate jump while an alternative is being
     compiled.  */
  dummy_failure_jump,

  /* Push a dummy failure point and carry on.  Used at the end of
     alternatives.  */
  push_dummy_failure,

  /* Arguments: two-byte relative address, two-byte number N.  After
     matching N times, jump to the address upon failure.  */
  succeed_n,

  /* Arguments: two-byte relative address, two-byte number N.  Jump to
     the address N times, then fail.  */
  jump_n,

  /* Store the two-byte number that follows at the place named by the
     preceding two-byte relative address.  The address *includes* the
     two bytes of number.  */
  set_number_at,

  wordchar,	/* Match any word-constituent character.  */
  notwordchar,	/* Match any character that is not a word-constituent.  */

  wordbeg,	/* Succeed at the beginning of a word.  */
  wordend,	/* Succeed at the end of a word.  */

  wordbound,	/* Succeed at a word boundary.  */
  notwordbound	/* Succeed when not at a word boundary.  */

#ifdef emacs
  ,before_dot,	/* Succeed if before point.  */
  at_dot,	/* Succeed if at point.  */
  after_dot,	/* Succeed if after point.  */

  /* Match any character with the given syntax.  Argument: one byte
     holding a syntax code, e.g., Sword.  */
  syntaxspec,

  /* Match any character whose syntax is not the given one.  */
  notsyntaxspec
#endif /* emacs */
} re_opcode_t;
379
/* Common operations on the compiled pattern.  */

/* Write NUMBER into the two contiguous bytes that start at DESTINATION:
   the low-order eight bits go into the first byte and the remaining
   bits into the second.  NUMBER is evaluated twice.  */

#define STORE_NUMBER(destination, number) \
  do \
    { \
      (destination)[0] = (number) & 0xff; \
      (destination)[1] = (number) >> 8; \
    } \
  while (0)

/* Like STORE_NUMBER, and afterwards advance DESTINATION past the two
   bytes just written.  DESTINATION therefore has to be an lvalue.  */

#define STORE_NUMBER_AND_INCR(destination, number) \
  do \
    { \
      STORE_NUMBER (destination, number); \
      (destination) += 2; \
    } \
  while (0)
399
/* Put into DESTINATION the number stored in the two contiguous bytes
   starting at SOURCE.  The first byte supplies the low-order eight bits;
   the second byte is sign-extended and supplies the rest, so negative
   numbers (e.g. backward jump offsets) come back negative.

   The high part is scaled by multiplication rather than with `<< 8':
   left-shifting a negative value is undefined behavior in C, and the
   sign-extended byte is negative for every negative number.  */

#define EXTRACT_NUMBER(destination, source) \
  do { \
    (destination) = *(source) & 0377; \
    (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) * (1 << 8); \
  } while (0)
408
409 #ifdef DEBUG
/* Function form of EXTRACT_NUMBER, compiled only under DEBUG: store in
   *DEST the number held in the two bytes at SOURCE.  SOURCE[0] is the
   low-order byte; SOURCE[1] is sign-extended and supplies the high part.

   The high part is scaled by multiplication instead of `<< 8' because
   left-shifting a negative value is undefined behavior in C.  */
static void
extract_number (dest, source)
    int *dest;
    unsigned char *source;
{
  int temp = SIGN_EXTEND_CHAR (*(source + 1));
  *dest = *source & 0377;
  *dest += temp * (1 << 8);
}
419
420 #ifndef EXTRACT_MACROS /* To debug the macros. */
421 #undef EXTRACT_NUMBER
422 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
/* [<][>][^][v][top][bottom][index][help] */
423 #endif /* not EXTRACT_MACROS */
424
425 #endif /* DEBUG */
426
/* Like EXTRACT_NUMBER, and afterwards step SOURCE past the two bytes
   that were read.  SOURCE therefore has to be an lvalue.  */

#define EXTRACT_NUMBER_AND_INCR(destination, source) \
  do \
    { \
      EXTRACT_NUMBER (destination, source); \
      (source) += 2; \
    } \
  while (0)
435
436 #ifdef DEBUG
/* Function form of EXTRACT_NUMBER_AND_INCR, compiled only under DEBUG:
   read a two-byte number from **SOURCE into *DESTINATION with
   extract_number, then advance *SOURCE by two bytes.  */
static void
extract_number_and_incr (destination, source)
    int *destination;
    unsigned char **source;
{
  unsigned char *cursor = *source;

  extract_number (destination, cursor);
  *source = cursor + 2;
}
445
446 #ifndef EXTRACT_MACROS
447 #undef EXTRACT_NUMBER_AND_INCR
448 #define EXTRACT_NUMBER_AND_INCR(dest, src) \
/* [<][>][^][v][top][bottom][index][help] */
449 extract_number_and_incr (&dest, &src)
450 #endif /* not EXTRACT_MACROS */
451
452 #endif /* DEBUG */
453
454 /* If DEBUG is defined, Regex prints many voluminous messages about what
455 it is doing (if the variable `debug' is nonzero). If linked with the
456 main program in `iregex.c', you can enter patterns and strings
457 interactively. And if linked with the main program in `main.c' and
458 the other test files, you can run the already-written tests. */
459
460 #ifdef DEBUG
461
462 /* We use standard I/O for debugging. */
463 #include <stdio.h>
464
465 /* It is useful to test things that ``must'' be true when debugging. */
466 #include <assert.h>
467
/* Run-time switch: the DEBUG_PRINT* macros below print only while this
   is nonzero.  */
static int debug = 0;

/* Emit E verbatim.  E may be a declaration, so this one must stay
   unwrapped.  */
#define DEBUG_STATEMENT(e) e

/* Conditional printf wrappers taking a format plus 0..3 arguments.

   Each expansion is wrapped in `do { ... } while (0)' so that it behaves
   as exactly one statement.  A bare `if (debug) ...' expansion would
   capture the `else' of an enclosing `if' written around the macro.  */
#define DEBUG_PRINT1(x) \
  do { if (debug) printf (x); } while (0)
#define DEBUG_PRINT2(x1, x2) \
  do { if (debug) printf (x1, x2); } while (0)
#define DEBUG_PRINT3(x1, x2, x3) \
  do { if (debug) printf (x1, x2, x3); } while (0)
#define DEBUG_PRINT4(x1, x2, x3, x4) \
  do { if (debug) printf (x1, x2, x3, x4); } while (0)

/* Dump the compiled pattern bytes from S up to (not including) E.  The
   first argument P is accepted but not used.  */
#define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
  do { if (debug) print_partial_compiled_pattern (s, e); } while (0)

/* Print the text from W onward in the string pair (S1, SZ1), (S2, SZ2).  */
#define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
  do { if (debug) print_double_string (w, s1, sz1, s2, sz2); } while (0)
479
/* printchar is defined locally here in place of the former
   `extern void printchar ();' declaration (change by wlee@isi.edu).

   Write C to standard output.  Characters from 040 up to but not
   including 0177 are written as themselves; anything else is written as
   a backslash followed by three octal digits.  */
static void
printchar (c)
    char c;
{
  int printable = (c >= 040 && c < 0177);

  if (printable)
    {
      putchar (c);
      return;
    }

  putchar ('\\');
  putchar ('0' + ((c >> 6) & 3));
  putchar ('0' + ((c >> 3) & 7));
  putchar ('0' + (c & 7));
}
498
499
500 /* Print the fastmap in human-readable form. */
501
502 void
503 print_fastmap (fastmap)
/* [<][>][^][v][top][bottom][index][help] */
504 char *fastmap;
505 {
506 unsigned was_a_range = 0;
507 unsigned i = 0;
508
509 while (i < (1 << BYTEWIDTH))
510 {
511 if (fastmap[i++])
512 {
513 was_a_range = 0;
514 printchar (i - 1);
515 while (i < (1 << BYTEWIDTH) && fastmap[i])
516 {
517 was_a_range = 1;
518 i++;
519 }
520 if (was_a_range)
521 {
522 printf ("-");
523 printchar (i - 1);
524 }
525 }
526 }
527 putchar ('\n');
528 }
529
530
531 /* Print a compiled pattern string in human-readable form, starting at
532 the START pointer into it and ending just before the pointer END. */
533
534 void
535 print_partial_compiled_pattern (start, end)
/* [<][>][^][v][top][bottom][index][help] */
536 unsigned char *start;
537 unsigned char *end;
538 {
539 int mcnt, mcnt2;
540 unsigned char *p = start;
541 unsigned char *pend = end;
542
543 if (start == NULL)
544 {
545 printf ("(null)\n");
546 return;
547 }
548
549 /* Loop over pattern commands. */
550 while (p < pend)
551 {
552 switch ((re_opcode_t) *p++)
553 {
554 case no_op:
555 printf ("/no_op");
556 break;
557
558 case exactn:
559 mcnt = *p++;
560 printf ("/exactn/%d", mcnt);
561 do
562 {
563 putchar ('/');
564 printchar (*p++);
565 }
566 while (--mcnt);
567 break;
568
569 case start_memory:
570 mcnt = *p++;
571 printf ("/start_memory/%d/%d", mcnt, *p++);
572 break;
573
574 case stop_memory:
575 mcnt = *p++;
576 printf ("/stop_memory/%d/%d", mcnt, *p++);
577 break;
578
579 case duplicate:
580 printf ("/duplicate/%d", *p++);
581 break;
582
583 case anychar:
584 printf ("/anychar");
585 break;
586
587 case charset:
588 case charset_not:
589 {
590 register int c;
591
592 printf ("/charset%s",
593 (re_opcode_t) *(p - 1) == charset_not ? "_not" : "");
594
595 assert (p + *p < pend);
596
597 for (c = 0; c < *p; c++)
598 {
599 unsigned bit;
600 unsigned char map_byte = p[1 + c];
601
602 putchar ('/');
603
604 for (bit = 0; bit < BYTEWIDTH; bit++)
605 if (map_byte & (1 << bit))
606 printchar (c * BYTEWIDTH + bit);
607 }
608 p += 1 + *p;
609 break;
610 }
611
612 case begline:
613 printf ("/begline");
614 break;
615
616 case endline:
617 printf ("/endline");
618 break;
619
620 case on_failure_jump:
621 extract_number_and_incr (&mcnt, &p);
622 printf ("/on_failure_jump/0/%d", mcnt);
623 break;
624
625 case on_failure_keep_string_jump:
626 extract_number_and_incr (&mcnt, &p);
627 printf ("/on_failure_keep_string_jump/0/%d", mcnt);
628 break;
629
630 case dummy_failure_jump:
631 extract_number_and_incr (&mcnt, &p);
632 printf ("/dummy_failure_jump/0/%d", mcnt);
633 break;
634
635 case push_dummy_failure:
636 printf ("/push_dummy_failure");
637 break;
638
639 case maybe_pop_jump:
640 extract_number_and_incr (&mcnt, &p);
641 printf ("/maybe_pop_jump/0/%d", mcnt);
642 break;
643
644 case pop_failure_jump:
645 extract_number_and_incr (&mcnt, &p);
646 printf ("/pop_failure_jump/0/%d", mcnt);
647 break;
648
649 case jump_past_alt:
650 extract_number_and_incr (&mcnt, &p);
651 printf ("/jump_past_alt/0/%d", mcnt);
652 break;
653
654 case jump:
655 extract_number_and_incr (&mcnt, &p);
656 printf ("/jump/0/%d", mcnt);
657 break;
658
659 case succeed_n:
660 extract_number_and_incr (&mcnt, &p);
661 extract_number_and_incr (&mcnt2, &p);
662 printf ("/succeed_n/0/%d/0/%d", mcnt, mcnt2);
663 break;
664
665 case jump_n:
666 extract_number_and_incr (&mcnt, &p);
667 extract_number_and_incr (&mcnt2, &p);
668 printf ("/jump_n/0/%d/0/%d", mcnt, mcnt2);
669 break;
670
671 case set_number_at:
672 extract_number_and_incr (&mcnt, &p);
673 extract_number_and_incr (&mcnt2, &p);
674 printf ("/set_number_at/0/%d/0/%d", mcnt, mcnt2);
675 break;
676
677 case wordbound:
678 printf ("/wordbound");
679 break;
680
681 case notwordbound:
682 printf ("/notwordbound");
683 break;
684
685 case wordbeg:
686 printf ("/wordbeg");
687 break;
688
689 case wordend:
690 printf ("/wordend");
691
692 #ifdef emacs
693 case before_dot:
694 printf ("/before_dot");
695 break;
696
697 case at_dot:
698 printf ("/at_dot");
699 break;
700
701 case after_dot:
702 printf ("/after_dot");
703 break;
704
705 case syntaxspec:
706 printf ("/syntaxspec");
707 mcnt = *p++;
708 printf ("/%d", mcnt);
709 break;
710
711 case notsyntaxspec:
712 printf ("/notsyntaxspec");
713 mcnt = *p++;
714 printf ("/%d", mcnt);
715 break;
716 #endif /* emacs */
717
718 case wordchar:
719 printf ("/wordchar");
720 break;
721
722 case notwordchar:
723 printf ("/notwordchar");
724 break;
725
726 case begbuf:
727 printf ("/begbuf");
728 break;
729
730 case endbuf:
731 printf ("/endbuf");
732 break;
733
734 default:
735 printf ("?%d", *(p-1));
736 }
737 }
738 printf ("/\n");
739 }
740
741
742 void
743 print_compiled_pattern (bufp)
/* [<][>][^][v][top][bottom][index][help] */
744 struct re_pattern_buffer *bufp;
745 {
746 unsigned char *buffer = bufp->buffer;
747
748 print_partial_compiled_pattern (buffer, buffer + bufp->used);
749 printf ("%d bytes used/%d bytes allocated.\n", bufp->used, bufp->allocated);
750
751 if (bufp->fastmap_accurate && bufp->fastmap)
752 {
753 printf ("fastmap: ");
754 print_fastmap (bufp->fastmap);
755 }
756
757 printf ("re_nsub: %d\t", bufp->re_nsub);
758 printf ("regs_alloc: %d\t", bufp->regs_allocated);
759 printf ("can_be_null: %d\t", bufp->can_be_null);
760 printf ("newline_anchor: %d\n", bufp->newline_anchor);
761 printf ("no_sub: %d\t", bufp->no_sub);
762 printf ("not_bol: %d\t", bufp->not_bol);
763 printf ("not_eol: %d\t", bufp->not_eol);
764 printf ("syntax: %d\n", bufp->syntax);
765 /* Perhaps we should print the translate table? */
766 }
767
768
/* Print, via printchar, the text that starts at WHERE within the string
   pair (STRING1, SIZE1), (STRING2, SIZE2).  When WHERE lies in the first
   string (as decided by FIRST_STRING_P), the rest of STRING1 is printed
   and then all of STRING2; otherwise only the part of STRING2 from WHERE
   onward.  A NULL WHERE prints "(null)".  */
void
print_double_string (where, string1, size1, string2, size2)
    const char *where;
    const char *string1;
    const char *string2;
    int size1;
    int size2;
{
  unsigned idx;

  if (where == NULL)
    {
      printf ("(null)");
      return;
    }

  if (FIRST_STRING_P (where))
    {
      idx = where - string1;
      while (idx < size1)
        {
          printchar (string1[idx]);
          idx++;
        }

      /* Continue from the very start of the second string.  */
      where = string2;
    }

  idx = where - string2;
  while (idx < size2)
    {
      printchar (string2[idx]);
      idx++;
    }
}
795
796 #else /* not DEBUG */
797
798 #undef assert
799 #define assert(e)
/* [<][>][^][v][top][bottom][index][help] */
800
801 #define DEBUG_STATEMENT(e)
/* [<][>][^][v][top][bottom][index][help] */
802 #define DEBUG_PRINT1(x)
/* [<][>][^][v][top][bottom][index][help] */
803 #define DEBUG_PRINT2(x1, x2)
/* [<][>][^][v][top][bottom][index][help] */
804 #define DEBUG_PRINT3(x1, x2, x3)
/* [<][>][^][v][top][bottom][index][help] */
805 #define DEBUG_PRINT4(x1, x2, x3, x4)
/* [<][>][^][v][top][bottom][index][help] */
806 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
/* [<][>][^][v][top][bottom][index][help] */
807 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
/* [<][>][^][v][top][bottom][index][help] */
808
809 #endif /* not DEBUG */
810
811 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
812 also be assigned to arbitrarily: each pattern buffer stores its own
813 syntax, so it can be changed between regex compilations. */
814 reg_syntax_t re_syntax_options = RE_SYNTAX_EMACS;
815
816
817 /* Specify the precise syntax of regexps for compilation. This provides
818 for compatibility for various utilities which historically have
819 different, incompatible syntaxes.
820
821 The argument SYNTAX is a bit mask comprised of the various bits
822 defined in regex.h. We return the old syntax. */
823
824 reg_syntax_t
825 re_set_syntax (syntax)
/* [<][>][^][v][top][bottom][index][help] */
826 reg_syntax_t syntax;
827 {
828 reg_syntax_t ret = re_syntax_options;
829
830 re_syntax_options = syntax;
831 return ret;
832 }
833
/* Error message for each error code listed in regex.h, indexed by the
   code's value, so the order here has to be the same as the order
   there.  The entry for REG_NOERROR is NULL.  */

static const char *re_error_msg[] =
  {
    NULL,                                       /* REG_NOERROR */
    "No match",                                 /* REG_NOMATCH */
    "Invalid regular expression",               /* REG_BADPAT */
    "Invalid collation character",              /* REG_ECOLLATE */
    "Invalid character class name",             /* REG_ECTYPE */
    "Trailing backslash",                       /* REG_EESCAPE */
    "Invalid back reference",                   /* REG_ESUBREG */
    "Unmatched [ or [^",                        /* REG_EBRACK */
    "Unmatched ( or \\(",                       /* REG_EPAREN */
    "Unmatched \\{",                            /* REG_EBRACE */
    "Invalid content of \\{\\}",                /* REG_BADBR */
    "Invalid range end",                        /* REG_ERANGE */
    "Memory exhausted",                         /* REG_ESPACE */
    "Invalid preceding regular expression",     /* REG_BADRPT */
    "Premature end of regular expression",      /* REG_EEND */
    "Regular expression too big",               /* REG_ESIZE */
    "Unmatched ) or \\)",                       /* REG_ERPAREN */
  };
856
857 /* Subroutine declarations and macros for regex_compile. */
858
859 static void store_op1 (), store_op2 ();
860 static void insert_op1 (), insert_op2 ();
861 static boolean at_begline_loc_p (), at_endline_loc_p ();
862 static boolean group_in_compile_stack ();
863 static reg_errcode_t compile_range ();
864
/* Read the next character of the uncompiled pattern into C, translating
   it when a `translate' table is in use.  The pattern arrives from the
   user as a string of (possibly signed) chars; the cast turns each one
   into an unsigned char so it can index arrays such as `translate'.
   Makes the enclosing function return REG_EEND when the pattern is
   exhausted.  Uses `p', `pend' and `translate' from the caller's scope.  */
#define PATFETCH(c) \
  do \
    { \
      if (p == pend) \
        return REG_EEND; \
      c = (unsigned char) *p++; \
      if (translate) \
        c = translate[c]; \
    } \
  while (0)

/* Read the next character of the uncompiled pattern into C without
   translating it.  */
#define PATFETCH_RAW(c) \
  do \
    { \
      if (p == pend) \
        return REG_EEND; \
      c = (unsigned char) *p++; \
    } \
  while (0)

/* Step back one character in the pattern.  */
#define PATUNFETCH p--


/* Yield translate[D] when `translate' is non-null and plain D otherwise.
   Some data is declared `char *' (so that passing a string constant
   draws no warning), hence the cast: a character used as a subscript
   has to be unsigned.  */
#define TRANSLATE(d) (translate ? translate[(unsigned char) (d)] : (d))
/* [<][>][^][v][top][bottom][index][help] */
891
892
/* Macros that append to the compiled pattern being built in `buffer'.
   They use `b' (the current end of the pattern) and `bufp' from the
   caller's scope.  */

/* Size used when the buffer arrives unallocated.  */
#define INIT_BUF_SIZE 32

/* Keep extending the buffer until at least N more bytes fit after `b'.  */
#define GET_BUFFER_SPACE(n) \
  while (b - bufp->buffer + (n) > bufp->allocated) \
    EXTEND_BUFFER ()

/* Reserve room for one byte, then append C.  */
#define BUF_PUSH(c) \
  do \
    { \
      GET_BUFFER_SPACE (1); \
      *b++ = (unsigned char) (c); \
    } \
  while (0)


/* Reserve room for two bytes, then append C1 followed by C2.  */
#define BUF_PUSH_2(c1, c2) \
  do \
    { \
      GET_BUFFER_SPACE (2); \
      *b++ = (unsigned char) (c1); \
      *b++ = (unsigned char) (c2); \
    } \
  while (0)


/* The three-byte counterpart of BUF_PUSH_2.  */
#define BUF_PUSH_3(c1, c2, c3) \
  do \
    { \
      GET_BUFFER_SPACE (3); \
      *b++ = (unsigned char) (c1); \
      *b++ = (unsigned char) (c2); \
      *b++ = (unsigned char) (c3); \
    } \
  while (0)
928
929
/* Relative address stored for a jump located at LOC whose target is TO:
   the distance between them, less the three bytes that the jump itself
   occupies.  */
#define JUMP_OFFSET(loc, to) ((to) - (loc) - 3)

/* Store a jump with opcode OP at LOC, targeting TO.  */
#define STORE_JUMP(op, loc, to) \
  store_op1 (op, loc, JUMP_OFFSET (loc, to))

/* The same, for a jump that carries a second argument ARG.  */
#define STORE_JUMP2(op, loc, to, arg) \
  store_op2 (op, loc, JUMP_OFFSET (loc, to), arg)

/* Like `STORE_JUMP', but inserts the jump.  Assumes `b' is the buffer
   end.  */
#define INSERT_JUMP(op, loc, to) \
  insert_op1 (op, loc, JUMP_OFFSET (loc, to), b)

/* Like `STORE_JUMP2', but inserts the jump.  Assumes `b' is the buffer
   end.  */
#define INSERT_JUMP2(op, loc, to, arg) \
  insert_op2 (op, loc, JUMP_OFFSET (loc, to), arg, b)
946
947
/* This is not an arbitrary limit: the arguments which represent offsets
   into the pattern are two bytes long.  So if 2^16 bytes turns out to
   be too small, many things would have to change.  */
#define MAX_BUF_SIZE (1L << 16)


/* Double the size of the pattern buffer via realloc (never going above
   MAX_BUF_SIZE) and reset the pointers that pointed into the old block
   to point to the correct places in the new one.

   This must be expanded inside a function that has `bufp', `b',
   `begalt', `fixup_alt_jump', `laststart' and `pending_exact' in scope,
   because it contains `return' statements:
     - REG_ESIZE is returned when the buffer is already MAX_BUF_SIZE
       bytes long;
     - REG_ESPACE is returned when realloc fails.

   The result of realloc is kept in a temporary and `bufp->buffer' /
   `bufp->allocated' are only updated once it is known to have
   succeeded.  On failure the old block is therefore still owned by
   BUFP (it is not leaked) and `bufp->allocated' still describes it.  */
#define EXTEND_BUFFER() \
  do { \
    unsigned char *old_buffer = bufp->buffer; \
    unsigned char *new_buffer; \
    unsigned long new_allocated; \
    if (bufp->allocated == MAX_BUF_SIZE) \
      return REG_ESIZE; \
    new_allocated = (unsigned long) bufp->allocated << 1; \
    if (new_allocated > MAX_BUF_SIZE) \
      new_allocated = MAX_BUF_SIZE; \
    new_buffer = (unsigned char *) realloc (old_buffer, new_allocated); \
    if (new_buffer == NULL) \
      return REG_ESPACE; \
    bufp->buffer = new_buffer; \
    bufp->allocated = new_allocated; \
    /* If the buffer moved, move all the pointers into it.  */ \
    if (old_buffer != bufp->buffer) \
      { \
        b = (b - old_buffer) + bufp->buffer; \
        begalt = (begalt - old_buffer) + bufp->buffer; \
        /* These three may be null, meaning "not set"; keep them so.  */ \
        if (fixup_alt_jump) \
          fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer; \
        if (laststart) \
          laststart = (laststart - old_buffer) + bufp->buffer; \
        if (pending_exact) \
          pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
      } \
  } while (0)
982
983
984 /* Since we have one byte reserved for the register number argument to
985 {start,stop}_memory, the maximum number of groups we can report
986 things about is what fits in that byte. */
987 #define MAX_REGNUM 255
988
989 /* But patterns can have more than `MAX_REGNUM' registers. We just
990 ignore the excess. `regex_compile' keeps counting groups in a
`regnum_t' beyond MAX_REGNUM, but only emits start_memory/stop_memory
for group numbers that do not exceed it. */
991 typedef unsigned regnum_t;
992
993
994 /* Macros for the compile stack. */
995
996 /* Since offsets can go either forwards or backwards, this type needs to
997 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
998 typedef int pattern_offset_t;
999
/* One saved compilation state, recorded by `regex_compile' when a group
is opened and restored when it is closed. Positions are kept as
offsets from `bufp->buffer' so they stay valid if the buffer moves. */
1000 typedef struct
1001 {
1002 pattern_offset_t begalt_offset; /* Offset of `begalt'. */
1003 pattern_offset_t fixup_alt_jump; /* Offset of `fixup_alt_jump' plus 1, or 0 if it was null. */
1004 pattern_offset_t inner_group_offset; /* Offset of the third byte of the group's start_memory. */
1005 pattern_offset_t laststart_offset; /* Offset of `b' when the group was opened. */
1006 regnum_t regnum; /* Number assigned to the group. */
1007 } compile_stack_elt_t;
1008
1009
/* The stack itself: a heap array of SIZE elements of which the first
AVAIL are in use. */
1010 typedef struct
1011 {
1012 compile_stack_elt_t *stack;
1013 unsigned size;
1014 unsigned avail; /* Offset of next open position. */
1015 } compile_stack_type;
1016
1017
1018 #define INIT_COMPILE_STACK_SIZE 32
1019
/* These three expect a `compile_stack_type' variable named
`compile_stack' to be in scope. */
1020 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1021 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1022
1023 /* The next available element. */
1024 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1025
1026
/* Set the bit for character C in a list.  The list is the bitmap that
   starts at `b' (which must be in scope), holding BYTEWIDTH bits per
   byte: C selects byte C / BYTEWIDTH and bit C % BYTEWIDTH within it.

   C is converted to `unsigned char' before use.  Both uses of C are
   fully parenthesized so that an expression argument such as `x | 1'
   is converted as a whole.  */
#define SET_LIST_BIT(c) \
  (b[((unsigned char) (c)) / BYTEWIDTH] \
   |= 1 << (((unsigned char) (c)) % BYTEWIDTH))
1031
1032
/* Get the next unsigned number in the uncompiled pattern.

   Expects `p', `pend' and `c' to be in scope.  If the pattern is not
   exhausted, characters are fetched with PATFETCH into `c' for as long
   as they are digits and accumulated, in decimal, into NUM.  NUM is
   left untouched when the first character fetched is not a digit, so a
   caller that presets it to a negative value can tell that no number
   was present.  On exit `c' holds the last character fetched.

   Accumulation stops once NUM exceeds RE_DUP_MAX, although the digits
   are still consumed: the value can then no longer be a valid bound,
   and going on multiplying would overflow `int' for a long enough run
   of digits.  The caller in `regex_compile' rejects every bound above
   RE_DUP_MAX, so the exact out-of-range value does not matter.  */
#define GET_UNSIGNED_NUMBER(num) \
  { if (p != pend) \
      { \
        PATFETCH (c); \
        while (ISDIGIT (c)) \
          { \
            if (num < 0) \
              num = 0; \
            if (num <= RE_DUP_MAX) \
              num = num * 10 + c - '0'; \
            if (p == pend) \
              break; \
            PATFETCH (c); \
          } \
      } \
  }
1049
1050 #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
1051
/* Nonzero if STRING, compared with STREQ, is one of the twelve
character class names listed here. */
1052 #define IS_CHAR_CLASS(string) \
1053 (STREQ (string, "alpha") || STREQ (string, "upper") \
1054 || STREQ (string, "lower") || STREQ (string, "digit") \
1055 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
1056 || STREQ (string, "space") || STREQ (string, "print") \
1057 || STREQ (string, "punct") || STREQ (string, "graph") \
1058 || STREQ (string, "cntrl") || STREQ (string, "blank"))
1059
1060 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
1061 Returns one of error codes defined in `regex.h', or zero for success.
1062
1063 Assumes the `allocated' (and perhaps `buffer') and `translate'
1064 fields are set in BUFP on entry.
1065
1066 If it succeeds, results are put in BUFP (if it returns an error, the
1067 contents of BUFP are undefined):
1068 `buffer' is the compiled pattern;
1069 `syntax' is set to SYNTAX;
1070 `used' is set to the length of the compiled pattern;
1071 `fastmap_accurate' is zero;
1072 `re_nsub' is the number of subexpressions in PATTERN;
1073 `not_bol' and `not_eol' are zero;
1074
1075 The `fastmap' and `newline_anchor' fields are neither
1076 examined nor set. */
/* The buffer-output macros used below (BUF_PUSH and friends,
GET_BUFFER_SPACE) expand EXTEND_BUFFER, which can itself return
REG_ESIZE or REG_ESPACE from this function.

NOTE(review): `compile_stack.stack' is freed only on the success path
at the end of the function; every error `return' below (including the
ones hidden in those macros) leaves it allocated. */
1077
1078 static reg_errcode_t
1079 regex_compile (pattern, size, syntax, bufp)
1080 const char *pattern;
1081 int size;
1082 reg_syntax_t syntax;
1083 struct re_pattern_buffer *bufp;
1084 {
1085 /* We fetch characters from PATTERN here. Even though PATTERN is
1086 `char *' (i.e., signed), we declare these variables as unsigned, so
1087 they can be reliably used as array indices. */
1088 register unsigned char c, c1;
1089
1090 /* A random temporary spot in PATTERN. */
1091 const char *p1;
1092
1093 /* Points to the end of the buffer, where we should append. */
1094 register unsigned char *b;
1095
1096 /* Keeps track of unclosed groups. */
1097 compile_stack_type compile_stack;
1098
1099 /* Points to the current (ending) position in the pattern. */
1100 const char *p = pattern;
1101 const char *pend = pattern + size;
1102
1103 /* How to translate the characters in the pattern. */
1104 char *translate = bufp->translate;
1105
1106 /* Address of the count-byte of the most recently inserted `exactn'
1107 command. This makes it possible to tell if a new exact-match
1108 character can be added to that command or if the character requires
1109 a new `exactn' command. */
1110 unsigned char *pending_exact = 0;
1111
1112 /* Address of start of the most recently finished expression.
1113 This tells, e.g., postfix * where to find the start of its
1114 operand. Reset at the beginning of groups and alternatives. */
1115 unsigned char *laststart = 0;
1116
1117 /* Address of beginning of regexp, or inside of last group. */
1118 unsigned char *begalt;
1119
1120 /* Place in the uncompiled pattern (i.e., the {) to
1121 which to go back if the interval is invalid. */
1122 const char *beg_interval;
1123
1124 /* Address of the place where a forward jump should go to the end of
1125 the containing expression. Each alternative of an `or' -- except the
1126 last -- ends with a forward jump of this sort. */
1127 unsigned char *fixup_alt_jump = 0;
1128
1129 /* Counts open-groups as they are encountered. Remembered for the
1130 matching close-group on the compile stack, so the same register
1131 number is put in the stop_memory as the start_memory. */
1132 regnum_t regnum = 0;
1133
1134 #ifdef DEBUG
1135 DEBUG_PRINT1 ("\nCompiling pattern: ");
1136 if (debug)
1137 {
1138 unsigned debug_count;
1139
1140 for (debug_count = 0; debug_count < size; debug_count++)
1141 printchar (pattern[debug_count]);
1142 putchar ('\n');
1143 }
1144 #endif /* DEBUG */
1145
1146 /* Initialize the compile stack. */
1147 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
1148 if (compile_stack.stack == NULL)
1149 return REG_ESPACE;
1150
1151 compile_stack.size = INIT_COMPILE_STACK_SIZE;
1152 compile_stack.avail = 0;
1153
1154 /* Initialize the pattern buffer. */
1155 bufp->syntax = syntax;
1156 bufp->fastmap_accurate = 0;
1157 bufp->not_bol = bufp->not_eol = 0;
1158
1159 /* Set `used' to zero, so that if we return an error, the pattern
1160 printer (for debugging) will think there's no pattern. We reset it
1161 at the end. */
1162 bufp->used = 0;
1163
1164 /* Always count groups, whether or not bufp->no_sub is set. */
1165 bufp->re_nsub = 0;
1166
1167 #if !defined (emacs) && !defined (SYNTAX_TABLE)
1168 /* Initialize the syntax table. */
1169 init_syntax_once ();
1170 #endif
1171
1172 if (bufp->allocated == 0)
1173 {
1174 if (bufp->buffer)
1175 { /* If zero allocated, but buffer is non-null, try to realloc
1176 enough space. This loses if buffer's address is bogus, but
1177 that is the user's responsibility. */
1178 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
1179 }
1180 else
1181 { /* Caller did not allocate a buffer. Do it for them. */
1182 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
1183 }
1184 if (!bufp->buffer) return REG_ESPACE;
1185
1186 bufp->allocated = INIT_BUF_SIZE;
1187 }
1188
1189 begalt = b = bufp->buffer;
1190
1191 /* Loop through the uncompiled pattern until we're at the end. */
1192 while (p != pend)
1193 {
1194 PATFETCH (c);
1195
1196 switch (c)
1197 {
1198 case '^':
1199 {
1200 if ( /* If at start of pattern, it's an operator. */
1201 p == pattern + 1
1202 /* If context independent, it's an operator. */
1203 || syntax & RE_CONTEXT_INDEP_ANCHORS
1204 /* Otherwise, depends on what's come before. */
1205 || at_begline_loc_p (pattern, p, syntax))
1206 BUF_PUSH (begline);
1207 else
1208 goto normal_char;
1209 }
1210 break;
1211
1212
1213 case '$':
1214 {
1215 if ( /* If at end of pattern, it's an operator. */
1216 p == pend
1217 /* If context independent, it's an operator. */
1218 || syntax & RE_CONTEXT_INDEP_ANCHORS
1219 /* Otherwise, depends on what's next. */
1220 || at_endline_loc_p (p, pend, syntax))
1221 BUF_PUSH (endline);
1222 else
1223 goto normal_char;
1224 }
1225 break;
1226
1227
1228 case '+':
1229 case '?':
1230 if ((syntax & RE_BK_PLUS_QM)
1231 || (syntax & RE_LIMITED_OPS))
1232 goto normal_char;
1233 handle_plus:
1234 case '*':
1235 /* If there is no previous pattern... */
1236 if (!laststart)
1237 {
1238 if (syntax & RE_CONTEXT_INVALID_OPS)
1239 return REG_BADRPT;
1240 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
1241 goto normal_char;
1242 }
1243
1244 {
1245 /* Are we optimizing this jump? */
1246 boolean keep_string_p = false;
1247
1248 /* 1 means zero (many) matches is allowed. */
1249 char zero_times_ok = 0, many_times_ok = 0;
1250
1251 /* If there is a sequence of repetition chars, collapse it
1252 down to just one (the right one). We can't combine
1253 interval operators with these because of, e.g., `a{2}*',
1254 which should only match an even number of `a's. */
1255
1256 for (;;)
1257 {
1258 zero_times_ok |= c != '+';
1259 many_times_ok |= c != '?';
1260
1261 if (p == pend)
1262 break;
1263
1264 PATFETCH (c);
1265
1266 if (c == '*'
1267 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
1268 ;
1269
1270 else if (syntax & RE_BK_PLUS_QM && c == '\\')
1271 {
1272 if (p == pend) return REG_EESCAPE;
1273
1274 PATFETCH (c1);
1275 if (!(c1 == '+' || c1 == '?'))
1276 {
1277 PATUNFETCH;
1278 PATUNFETCH;
1279 break;
1280 }
1281
1282 c = c1;
1283 }
1284 else
1285 {
1286 PATUNFETCH;
1287 break;
1288 }
1289
1290 /* If we get here, we found another repeat character. */
1291 }
1292
1293 /* Star, etc. applied to an empty pattern is equivalent
1294 to an empty pattern. */
1295 if (!laststart)
1296 break;
1297
1298 /* Now we know whether or not zero matches is allowed
1299 and also whether or not two or more matches is allowed. */
1300 if (many_times_ok)
1301 { /* More than one repetition is allowed, so put in at the
1302 end a backward relative jump from `b' to before the next
1303 jump we're going to put in below (which jumps from
1304 laststart to after this jump).
1305
1306 But if we are at the `*' in the exact sequence `.*\n',
1307 insert an unconditional jump backwards to the .,
1308 instead of the beginning of the loop. This way we only
1309 push a failure point once, instead of every time
1310 through the loop. */
1311 assert (p - 1 > pattern);
1312
1313 /* Allocate the space for the jump. */
1314 GET_BUFFER_SPACE (3);
1315
1316 /* We know we are not at the first character of the pattern,
1317 because laststart was nonzero. And we've already
1318 incremented `p', by the way, to be the character after
1319 the `*'. Do we have to do something analogous here
1320 for null bytes, because of RE_DOT_NOT_NULL? */
1321 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
1322 && zero_times_ok
1323 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
1324 && !(syntax & RE_DOT_NEWLINE))
1325 { /* We have .*\n. */
1326 STORE_JUMP (jump, b, laststart);
1327 keep_string_p = true;
1328 }
1329 else
1330 /* Anything else. */
1331 STORE_JUMP (maybe_pop_jump, b, laststart - 3);
1332
1333 /* We've added more stuff to the buffer. */
1334 b += 3;
1335 }
1336
1337 /* On failure, jump from laststart to b + 3, which will be the
1338 end of the buffer after this jump is inserted. */
1339 GET_BUFFER_SPACE (3);
1340 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
1341 : on_failure_jump,
1342 laststart, b + 3);
1343 pending_exact = 0;
1344 b += 3;
1345
1346 if (!zero_times_ok)
1347 {
1348 /* At least one repetition is required, so insert a
1349 `dummy_failure_jump' before the initial
1350 `on_failure_jump' instruction of the loop. This
1351 effects a skip over that instruction the first time
1352 we hit that loop. */
1353 GET_BUFFER_SPACE (3);
1354 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
1355 b += 3;
1356 }
1357 }
1358 break;
1359
1360
1361 case '.':
1362 laststart = b;
1363 BUF_PUSH (anychar);
1364 break;
1365
1366
1367 case '[':
1368 {
1369 boolean had_char_class = false;
1370
1371 if (p == pend) return REG_EBRACK;
1372
1373 /* Ensure that we have enough space to push a charset: the
1374 opcode, the length count, and the bitset; 34 bytes in all. */
1375 GET_BUFFER_SPACE (34);
1376
1377 laststart = b;
1378
1379 /* We test `*p == '^' twice, instead of using an if
1380 statement, so we only need one BUF_PUSH. */
1381 BUF_PUSH (*p == '^' ? charset_not : charset);
1382 if (*p == '^')
1383 p++;
1384
1385 /* Remember the first position in the bracket expression. */
1386 p1 = p;
1387
1388 /* Push the number of bytes in the bitmap. */
1389 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
1390
1391 /* Clear the whole map. */
1392 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
1393
1394 /* charset_not matches newline according to a syntax bit. */
1395 if ((re_opcode_t) b[-2] == charset_not
1396 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
1397 SET_LIST_BIT ('\n');
1398
1399 /* Read in characters and ranges, setting map bits. */
1400 for (;;)
1401 {
1402 if (p == pend) return REG_EBRACK;
1403
1404 PATFETCH (c);
1405
1406 /* \ might escape characters inside [...] and [^...]. */
1407 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
1408 {
1409 if (p == pend) return REG_EESCAPE;
1410
1411 PATFETCH (c1);
1412 SET_LIST_BIT (c1);
1413 continue;
1414 }
1415
1416 /* Could be the end of the bracket expression. If it's
1417 not (i.e., when the bracket expression is `[]' so
1418 far), the ']' character bit gets set way below. */
1419 if (c == ']' && p != p1 + 1)
1420 break;
1421
1422 /* Look ahead to see if it's a range when the last thing
1423 was a character class. */
1424 if (had_char_class && c == '-' && *p != ']')
1425 return REG_ERANGE;
1426
1427 /* Look ahead to see if it's a range when the last thing
1428 was a character: if this is a hyphen not at the
1429 beginning or the end of a list, then it's the range
1430 operator. */
1431 if (c == '-'
1432 && !(p - 2 >= pattern && p[-2] == '[')
1433 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
1434 && *p != ']')
1435 {
1436 reg_errcode_t ret
1437 = compile_range (&p, pend, translate, syntax, b);
1438 if (ret != REG_NOERROR) return ret;
1439 }
1440
1441 else if (p[0] == '-' && p[1] != ']')
1442 { /* This handles ranges made up of characters only. */
1443 reg_errcode_t ret;
1444
1445 /* Move past the `-'. */
1446 PATFETCH (c1);
1447
1448 ret = compile_range (&p, pend, translate, syntax, b);
1449 if (ret != REG_NOERROR) return ret;
1450 }
1451
1452 /* See if we're at the beginning of a possible character
1453 class. */
1454
1455 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
1456 { /* Leave room for the null. */
1457 char str[CHAR_CLASS_MAX_LENGTH + 1];
1458
1459 PATFETCH (c);
1460 c1 = 0;
1461
1462 /* If pattern is `[[:'. */
1463 if (p == pend) return REG_EBRACK;
1464
1465 for (;;)
1466 {
1467 PATFETCH (c);
1468 if (c == ':' || c == ']' || p == pend
1469 || c1 == CHAR_CLASS_MAX_LENGTH)
1470 break;
1471 str[c1++] = c;
1472 }
1473 str[c1] = '\0';
1474
1475 /* If it is a word bracketed by `[:' and `:]', set the
bits for that class. Otherwise (see the `else' below)
1476 undo the ending character, the letters, and leave
1477 the leading `:' and `[' (but set bits for them). */
1478 if (c == ':' && *p == ']')
1479 {
1480 int ch;
1481 boolean is_alnum = STREQ (str, "alnum");
1482 boolean is_alpha = STREQ (str, "alpha");
1483 boolean is_blank = STREQ (str, "blank");
1484 boolean is_cntrl = STREQ (str, "cntrl");
1485 boolean is_digit = STREQ (str, "digit");
1486 boolean is_graph = STREQ (str, "graph");
1487 boolean is_lower = STREQ (str, "lower");
1488 boolean is_print = STREQ (str, "print");
1489 boolean is_punct = STREQ (str, "punct");
1490 boolean is_space = STREQ (str, "space");
1491 boolean is_upper = STREQ (str, "upper");
1492 boolean is_xdigit = STREQ (str, "xdigit");
1493
1494 if (!IS_CHAR_CLASS (str)) return REG_ECTYPE;
1495
1496 /* Throw away the ] at the end of the character
1497 class. */
1498 PATFETCH (c);
1499
1500 if (p == pend) return REG_EBRACK;
1501
1502 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
1503 {
1504 if ( (is_alnum && ISALNUM (ch))
1505 || (is_alpha && ISALPHA (ch))
1506 || (is_blank && ISBLANK (ch))
1507 || (is_cntrl && ISCNTRL (ch))
1508 || (is_digit && ISDIGIT (ch))
1509 || (is_graph && ISGRAPH (ch))
1510 || (is_lower && ISLOWER (ch))
1511 || (is_print && ISPRINT (ch))
1512 || (is_punct && ISPUNCT (ch))
1513 || (is_space && ISSPACE (ch))
1514 || (is_upper && ISUPPER (ch))
1515 || (is_xdigit && ISXDIGIT (ch)))
1516 SET_LIST_BIT (ch);
1517 }
1518 had_char_class = true;
1519 }
1520 else
1521 {
1522 c1++;
1523 while (c1--)
1524 PATUNFETCH;
1525 SET_LIST_BIT ('[');
1526 SET_LIST_BIT (':');
1527 had_char_class = false;
1528 }
1529 }
1530 else
1531 {
1532 had_char_class = false;
1533 SET_LIST_BIT (c);
1534 }
1535 }
1536
1537 /* Discard any (non)matching list bytes that are all 0 at the
1538 end of the map. Decrease the map-length byte too. */
1539 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
1540 b[-1]--;
1541 b += b[-1];
1542 }
1543 break;
1544
1545
1546 case '(':
1547 if (syntax & RE_NO_BK_PARENS)
1548 goto handle_open;
1549 else
1550 goto normal_char;
1551
1552
1553 case ')':
1554 if (syntax & RE_NO_BK_PARENS)
1555 goto handle_close;
1556 else
1557 goto normal_char;
1558
1559
1560 case '\n':
1561 if (syntax & RE_NEWLINE_ALT)
1562 goto handle_alt;
1563 else
1564 goto normal_char;
1565
1566
1567 case '|':
1568 if (syntax & RE_NO_BK_VBAR)
1569 goto handle_alt;
1570 else
1571 goto normal_char;
1572
1573
1574 case '{':
1575 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
1576 goto handle_interval;
1577 else
1578 goto normal_char;
1579
1580
1581 case '\\':
1582 if (p == pend) return REG_EESCAPE;
1583
1584 /* Do not translate the character after the \, so that we can
1585 distinguish, e.g., \B from \b, even if we normally would
1586 translate, e.g., B to b. */
1587 PATFETCH_RAW (c);
1588
1589 switch (c)
1590 {
1591 case '(':
1592 if (syntax & RE_NO_BK_PARENS)
1593 goto normal_backslash;
1594
1595 handle_open:
1596 bufp->re_nsub++;
1597 regnum++;
1598
1599 if (COMPILE_STACK_FULL)
1600 {
1601 RETALLOC (compile_stack.stack, compile_stack.size << 1,
1602 compile_stack_elt_t);
1603 if (compile_stack.stack == NULL) return REG_ESPACE;
1604
1605 compile_stack.size <<= 1;
1606 }
1607
1608 /* These are the values to restore when we hit end of this
1609 group. They are all relative offsets, so that if the
1610 whole pattern moves because of realloc, they will still
1611 be valid. */
1612 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
1613 COMPILE_STACK_TOP.fixup_alt_jump
1614 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
1615 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
1616 COMPILE_STACK_TOP.regnum = regnum;
1617
1618 /* We will eventually replace the 0 with the number of
1619 groups inner to this one. But do not push a
1620 start_memory for groups beyond the last one we can
1621 represent in the compiled pattern. */
1622 if (regnum <= MAX_REGNUM)
1623 {
1624 COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2;
1625 BUF_PUSH_3 (start_memory, regnum, 0);
1626 }
1627
1628 compile_stack.avail++;
1629
1630 fixup_alt_jump = 0;
1631 laststart = 0;
1632 begalt = b;
1633 /* If we've reached MAX_REGNUM groups, then this open
1634 won't actually generate any code, so we'll have to
1635 clear pending_exact explicitly. */
1636 pending_exact = 0;
1637 break;
1638
1639
1640 case ')':
1641 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
1642
1643 if (COMPILE_STACK_EMPTY)
1644 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
1645 goto normal_backslash;
1646 else
1647 return REG_ERPAREN;
1648
1649 handle_close:
1650 if (fixup_alt_jump)
1651 { /* Push a dummy failure point at the end of the
1652 alternative for a possible future
1653 `pop_failure_jump' to pop. See comments at
1654 `push_dummy_failure' in `re_match_2'. */
1655 BUF_PUSH (push_dummy_failure);
1656
1657 /* We allocated space for this jump when we assigned
1658 to `fixup_alt_jump', in the `handle_alt' case below. */
1659 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1);
1660 }
1661
1662 /* See similar code for backslashed left paren above. */
1663 if (COMPILE_STACK_EMPTY)
1664 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
1665 goto normal_char;
1666 else
1667 return REG_ERPAREN;
1668
1669 /* Since we just checked for an empty stack above, this
1670 ``can't happen''. */
1671 assert (compile_stack.avail != 0);
1672 {
1673 /* We don't just want to restore into `regnum', because
1674 later groups should continue to be numbered higher,
1675 as in `(ab)c(de)' -- the second group is #2. */
1676 regnum_t this_group_regnum;
1677
1678 compile_stack.avail--;
1679 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
1680 fixup_alt_jump
1681 = COMPILE_STACK_TOP.fixup_alt_jump
1682 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
1683 : 0;
1684 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
1685 this_group_regnum = COMPILE_STACK_TOP.regnum;
1686 /* If this group's number is beyond MAX_REGNUM, then this close
1687 won't actually generate any code, so we'll have to
1688 clear pending_exact explicitly. */
1689 pending_exact = 0;
1690
1691 /* We're at the end of the group, so now we know how many
1692 groups were inside this one. */
1693 if (this_group_regnum <= MAX_REGNUM)
1694 {
1695 unsigned char *inner_group_loc
1696 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset;
1697
1698 *inner_group_loc = regnum - this_group_regnum;
1699 BUF_PUSH_3 (stop_memory, this_group_regnum,
1700 regnum - this_group_regnum);
1701 }
1702 }
1703 break;
1704
1705
1706 case '|': /* `\|'. */
1707 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
1708 goto normal_backslash;
1709 handle_alt:
1710 if (syntax & RE_LIMITED_OPS)
1711 goto normal_char;
1712
1713 /* Insert before the previous alternative a jump which
1714 jumps to this alternative if the former fails. */
1715 GET_BUFFER_SPACE (3);
1716 INSERT_JUMP (on_failure_jump, begalt, b + 6);
1717 pending_exact = 0;
1718 b += 3;
1719
1720 /* The alternative before this one has a jump after it
1721 which gets executed if it gets matched. Adjust that
1722 jump so it will jump to this alternative's analogous
1723 jump (put in below, which in turn will jump to the next
1724 (if any) alternative's such jump, etc.). The last such
1725 jump jumps to the correct final destination. A picture:
1726 _____ _____
1727 | | | |
1728 | v | v
1729 a | b | c
1730
1731 If we are at `b', then fixup_alt_jump right now points to a
1732 three-byte space after `a'. We'll put in the jump, set
1733 fixup_alt_jump to right after `b', and leave behind three
1734 bytes which we'll fill in when we get to after `c'. */
1735
1736 if (fixup_alt_jump)
1737 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
1738
1739 /* Mark and leave space for a jump after this alternative,
1740 to be filled in later either by next alternative or
1741 when know we're at the end of a series of alternatives. */
1742 fixup_alt_jump = b;
1743 GET_BUFFER_SPACE (3);
1744 b += 3;
1745
1746 laststart = 0;
1747 begalt = b;
1748 break;
1749
1750
1751 case '{':
1752 /* If \{ is a literal. */
1753 if (!(syntax & RE_INTERVALS)
1754 /* If we're at `\{' and it's not the open-interval
1755 operator. */
1756 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1757 || (p - 2 == pattern && p == pend))
1758 goto normal_backslash;
1759
1760 handle_interval:
1761 {
1762 /* If got here, then the syntax allows intervals. */
1763
1764 /* At least (most) this many matches must be made. */
1765 int lower_bound = -1, upper_bound = -1;
1766
1767 beg_interval = p - 1;
1768
1769 if (p == pend)
1770 {
1771 if (syntax & RE_NO_BK_BRACES)
1772 goto unfetch_interval;
1773 else
1774 return REG_EBRACE;
1775 }
1776
1777 GET_UNSIGNED_NUMBER (lower_bound);
1778
1779 if (c == ',')
1780 {
1781 GET_UNSIGNED_NUMBER (upper_bound);
1782 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
1783 }
1784 else
1785 /* Interval such as `{1}' => match exactly once. */
1786 upper_bound = lower_bound;
1787
1788 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
1789 || lower_bound > upper_bound)
1790 {
1791 if (syntax & RE_NO_BK_BRACES)
1792 goto unfetch_interval;
1793 else
1794 return REG_BADBR;
1795 }
1796
1797 if (!(syntax & RE_NO_BK_BRACES))
1798 {
1799 if (c != '\\') return REG_EBRACE;
1800
1801 PATFETCH (c);
1802 }
1803
1804 if (c != '}')
1805 {
1806 if (syntax & RE_NO_BK_BRACES)
1807 goto unfetch_interval;
1808 else
1809 return REG_BADBR;
1810 }
1811
1812 /* We just parsed a valid interval. */
1813
1814 /* If it's invalid to have no preceding re. */
1815 if (!laststart)
1816 {
1817 if (syntax & RE_CONTEXT_INVALID_OPS)
1818 return REG_BADRPT;
1819 else if (syntax & RE_CONTEXT_INDEP_OPS)
1820 laststart = b;
1821 else
1822 goto unfetch_interval;
1823 }
1824
1825 /* If the upper bound is zero, don't want to succeed at
1826 all; jump from `laststart' to `b + 3', which will be
1827 the end of the buffer after we insert the jump. */
1828 if (upper_bound == 0)
1829 {
1830 GET_BUFFER_SPACE (3);
1831 INSERT_JUMP (jump, laststart, b + 3);
1832 b += 3;
1833 }
1834
1835 /* Otherwise, we have a nontrivial interval. When
1836 we're all done, the pattern will look like:
1837 set_number_at <jump count> <upper bound>
1838 set_number_at <succeed_n count> <lower bound>
1839 succeed_n <after jump addr> <succeed_n count>
1840 <body of loop>
1841 jump_n <succeed_n addr> <jump count>
1842 (The upper bound and `jump_n' are omitted if
1843 `upper_bound' is 1, though.) */
1844 else
1845 { /* If the upper bound is > 1, we need to insert
1846 more at the end of the loop. */
1847 unsigned nbytes = 10 + (upper_bound > 1) * 10;
1848
1849 GET_BUFFER_SPACE (nbytes);
1850
1851 /* Initialize lower bound of the `succeed_n', even
1852 though it will be set during matching by its
1853 attendant `set_number_at' (inserted next),
1854 because `re_compile_fastmap' needs to know.
1855 Jump to the `jump_n' we might insert below. */
1856 INSERT_JUMP2 (succeed_n, laststart,
1857 b + 5 + (upper_bound > 1) * 5,
1858 lower_bound);
1859 b += 5;
1860
1861 /* Code to initialize the lower bound. Insert
1862 before the `succeed_n'. The `5' is the last two
1863 bytes of this `set_number_at', plus 3 bytes of
1864 the following `succeed_n'. */
1865 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
1866 b += 5;
1867
1868 if (upper_bound > 1)
1869 { /* More than one repetition is allowed, so
1870 append a backward jump to the `succeed_n'
1871 that starts this interval.
1872
1873 When we've reached this during matching,
1874 we'll have matched the interval once, so
1875 jump back only `upper_bound - 1' times. */
1876 STORE_JUMP2 (jump_n, b, laststart + 5,
1877 upper_bound - 1);
1878 b += 5;
1879
1880 /* The location we want to set is the second
1881 parameter of the `jump_n'; that is `b-2' as
1882 an absolute address. `laststart' will be
1883 the `set_number_at' we're about to insert;
1884 `laststart+3' the number to set, the source
1885 for the relative address. But we are
1886 inserting into the middle of the pattern --
1887 so everything is getting moved up by 5.
1888 Conclusion: (b - 2) - (laststart + 3) + 5,
1889 i.e., b - laststart.
1890
1891 We insert this at the beginning of the loop
1892 so that if we fail during matching, we'll
1893 reinitialize the bounds. */
1894 insert_op2 (set_number_at, laststart, b - laststart,
1895 upper_bound - 1, b);
1896 b += 5;
1897 }
1898 }
1899 pending_exact = 0;
1900 beg_interval = NULL;
1901 }
1902 break;
1903
1904 unfetch_interval:
1905 /* If an invalid interval, match the characters as literals. */
1906 assert (beg_interval);
1907 p = beg_interval;
1908 beg_interval = NULL;
1909
1910 /* normal_char and normal_backslash need `c'. */
1911 PATFETCH (c);
1912
1913 if (!(syntax & RE_NO_BK_BRACES))
1914 {
1915 if (p > pattern && p[-1] == '\\')
1916 goto normal_backslash;
1917 }
1918 goto normal_char;
1919
1920 #ifdef emacs
1921 /* There is no way to specify the before_dot and after_dot
1922 operators. rms says this is ok. --karl */
1923 case '=':
1924 BUF_PUSH (at_dot);
1925 break;
1926
1927 case 's':
1928 laststart = b;
1929 PATFETCH (c);
1930 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
1931 break;
1932
1933 case 'S':
1934 laststart = b;
1935 PATFETCH (c);
1936 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
1937 break;
1938 #endif /* emacs */
1939
1940
1941 case 'w':
1942 laststart = b;
1943 BUF_PUSH (wordchar);
1944 break;
1945
1946
1947 case 'W':
1948 laststart = b;
1949 BUF_PUSH (notwordchar);
1950 break;
1951
1952
1953 case '<':
1954 BUF_PUSH (wordbeg);
1955 break;
1956
1957 case '>':
1958 BUF_PUSH (wordend);
1959 break;
1960
1961 case 'b':
1962 BUF_PUSH (wordbound);
1963 break;
1964
1965 case 'B':
1966 BUF_PUSH (notwordbound);
1967 break;
1968
1969 case '`':
1970 BUF_PUSH (begbuf);
1971 break;
1972
1973 case '\'':
1974 BUF_PUSH (endbuf);
1975 break;
1976
1977 case '1': case '2': case '3': case '4': case '5':
1978 case '6': case '7': case '8': case '9':
1979 if (syntax & RE_NO_BK_REFS)
1980 goto normal_char;
1981
1982 c1 = c - '0';
1983
1984 if (c1 > regnum)
1985 return REG_ESUBREG;
1986
1987 /* Can't back reference to a subexpression if inside of it. */
1988 if (group_in_compile_stack (compile_stack, c1))
1989 goto normal_char;
1990
1991 laststart = b;
1992 BUF_PUSH_2 (duplicate, c1);
1993 break;
1994
1995
1996 case '+':
1997 case '?':
1998 if (syntax & RE_BK_PLUS_QM)
1999 goto handle_plus;
2000 else
2001 goto normal_backslash;
2002
2003 default:
2004 normal_backslash:
2005 /* You might think it would be useful for \ to mean
2006 not to translate; but if we don't translate it
2007 it will never match anything. */
2008 c = TRANSLATE (c);
2009 goto normal_char;
2010 }
2011 break;
2012
2013
2014 default:
2015 /* Expects the character in `c'. */
2016 normal_char:
2017 /* If no exactn currently being built. */
2018 if (!pending_exact
2019
2020 /* If last exactn not at current position. */
2021 || pending_exact + *pending_exact + 1 != b
2022
2023 /* We have only one byte following the exactn for the count. */
2024 || *pending_exact == (1 << BYTEWIDTH) - 1
2025
2026 /* If followed by a repetition operator.
NOTE(review): `*p' and `p[1]' are read in the tests below without
a `p != pend' check, and `p' is presumably equal to `pend' when
this is the last character -- confirm that PATTERN always has
readable bytes after its end. */
2027 || *p == '*' || *p == '^'
2028 || ((syntax & RE_BK_PLUS_QM)
2029 ? *p == '\\' && (p[1] == '+' || p[1] == '?')
2030 : (*p == '+' || *p == '?'))
2031 || ((syntax & RE_INTERVALS)
2032 && ((syntax & RE_NO_BK_BRACES)
2033 ? *p == '{'
2034 : (p[0] == '\\' && p[1] == '{'))))
2035 {
2036 /* Start building a new exactn. */
2037
2038 laststart = b;
2039
2040 BUF_PUSH_2 (exactn, 0);
2041 pending_exact = b - 1;
2042 }
2043
2044 BUF_PUSH (c);
2045 (*pending_exact)++;
2046 break;
2047 } /* switch (c) */
2048 } /* while p != pend */
2049
2050
2051 /* Through the pattern now. */
2052
2053 if (fixup_alt_jump)
2054 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
2055
2056 if (!COMPILE_STACK_EMPTY)
2057 return REG_EPAREN;
2058
2059 free (compile_stack.stack);
2060
2061 /* We have succeeded; set the length of the buffer. */
2062 bufp->used = b - bufp->buffer;
2063
2064 #ifdef DEBUG
2065 if (debug)
2066 {
2067 DEBUG_PRINT1 ("\nCompiled pattern: ");
2068 print_compiled_pattern (bufp);
2069 }
2070 #endif /* DEBUG */
2071
2072 return REG_NOERROR;
2073 } /* regex_compile */
2074
2075 /* Subroutines for `regex_compile'. */
2076
2077 /* Store OP at LOC followed by two-byte integer parameter ARG. */
2078
2079 static void
2080 store_op1 (op, loc, arg)
/* [<][>][^][v][top][bottom][index][help] */
2081 re_opcode_t op;
2082 unsigned char *loc;
2083 int arg;
2084 {
2085 *loc = (unsigned char) op;
2086 STORE_NUMBER (loc + 1, arg);
2087 }
2088
2089
2090 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
2091
2092 static void
2093 store_op2 (op, loc, arg1, arg2)
/* [<][>][^][v][top][bottom][index][help] */
2094 re_opcode_t op;
2095 unsigned char *loc;
2096 int arg1, arg2;
2097 {
2098 *loc = (unsigned char) op;
2099 STORE_NUMBER (loc + 1, arg1);
2100 STORE_NUMBER (loc + 3, arg2);
2101 }
2102
2103
2104 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
2105 for OP followed by two-byte integer parameter ARG. */
2106
2107 static void
2108 insert_op1 (op, loc, arg, end)
/* [<][>][^][v][top][bottom][index][help] */
2109 re_opcode_t op;
2110 unsigned char *loc;
2111 int arg;
2112 unsigned char *end;
2113 {
2114 register unsigned char *pfrom = end;
2115 register unsigned char *pto = end + 3;
2116
2117 while (pfrom != loc)
2118 *--pto = *--pfrom;
2119
2120 store_op1 (op, loc, arg);
2121 }
2122
2123
2124 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
2125
2126 static void
2127 insert_op2 (op, loc, arg1, arg2, end)
/* [<][>][^][v][top][bottom][index][help] */
2128 re_opcode_t op;
2129 unsigned char *loc;
2130 int arg1, arg2;
2131 unsigned char *end;
2132 {
2133 register unsigned char *pfrom = end;
2134 register unsigned char *pto = end + 5;
2135
2136 while (pfrom != loc)
2137 *--pto = *--pfrom;
2138
2139 store_op2 (op, loc, arg1, arg2);
2140 }
2141
2142
2143 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
2144 after an alternative or a begin-subexpression. We assume there is at
2145 least one character before the ^. */
2146
2147 static boolean
2148 at_begline_loc_p (pattern, p, syntax)
/* [<][>][^][v][top][bottom][index][help] */
2149 const char *pattern, *p;
2150 reg_syntax_t syntax;
2151 {
2152 const char *prev = p - 2;
2153 boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\';
2154
2155 return
2156 /* After a subexpression? */
2157 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
2158 /* After an alternative? */
2159 || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
2160 }
2161
2162
2163 /* The dual of at_begline_loc_p. This one is for $. We assume there is
2164 at least one character after the $, i.e., `P < PEND'. */
2165
2166 static boolean
2167 at_endline_loc_p (p, pend, syntax)
/* [<][>][^][v][top][bottom][index][help] */
2168 const char *p, *pend;
2169 int syntax;
2170 {
2171 const char *next = p;
2172 boolean next_backslash = *next == '\\';
2173 const char *next_next = p + 1 < pend ? p + 1 : NULL;
2174
2175 return
2176 /* Before a subexpression? */
2177 (syntax & RE_NO_BK_PARENS ? *next == ')'
2178 : next_backslash && next_next && *next_next == ')')
2179 /* Before an alternative? */
2180 || (syntax & RE_NO_BK_VBAR ? *next == '|'
2181 : next_backslash && next_next && *next_next == '|');
2182 }
2183
2184
2185 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
2186 false if it's not. */
2187
2188 static boolean
2189 group_in_compile_stack (compile_stack, regnum)
/* [<][>][^][v][top][bottom][index][help] */
2190 compile_stack_type compile_stack;
2191 regnum_t regnum;
2192 {
2193 int this_element;
2194
2195 for (this_element = compile_stack.avail - 1;
2196 this_element >= 0;
2197 this_element--)
2198 if (compile_stack.stack[this_element].regnum == regnum)
2199 return true;
2200
2201 return false;
2202 }
2203
2204
2205 /* Read the ending character of a range (in a bracket expression) from the
2206 uncompiled pattern *P_PTR (which ends at PEND). We assume the
2207 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
2208 Then we set the translation of all bits between the starting and
2209 ending characters (inclusive) in the compiled pattern B.
2210
2211 Return an error code.
2212
2213 We use these short variable names so we can use the same macros as
2214 `regex_compile' itself. */
2215
2216 static reg_errcode_t
2217 compile_range (p_ptr, pend, translate, syntax, b)
/* [<][>][^][v][top][bottom][index][help] */
2218 const char **p_ptr, *pend;
2219 char *translate;
2220 reg_syntax_t syntax;
2221 unsigned char *b;
2222 {
2223 unsigned this_char;
2224
2225 const char *p = *p_ptr;
2226 int range_start, range_end;
2227
2228 if (p == pend)
2229 return REG_ERANGE;
2230
2231 /* Even though the pattern is a signed `char *', we need to fetch
2232 with unsigned char *'s; if the high bit of the pattern character
2233 is set, the range endpoints will be negative if we fetch using a
2234 signed char *.
2235
2236 We also want to fetch the endpoints without translating them; the
2237 appropriate translation is done in the bit-setting loop below. */
2238 range_start = ((unsigned char *) p)[-2];
2239 range_end = ((unsigned char *) p)[0];
2240
2241 /* Have to increment the pointer into the pattern string, so the
2242 caller isn't still at the ending character. */
2243 (*p_ptr)++;
2244
2245 /* If the start is after the end, the range is empty. */
2246 if (range_start > range_end)
2247 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
2248
2249 /* Here we see why `this_char' has to be larger than an `unsigned
2250 char' -- the range is inclusive, so if `range_end' == 0xff
2251 (assuming 8-bit characters), we would otherwise go into an infinite
2252 loop, since all characters <= 0xff. */
2253 for (this_char = range_start; this_char <= range_end; this_char++)
2254 {
2255 SET_LIST_BIT (TRANSLATE (this_char));
2256 }
2257
2258 return REG_NOERROR;
2259 }
2260
2261 /* Failure stack declarations and macros; both re_compile_fastmap and
2262 re_match_2 use a failure stack. These have to be macros because of
2263 REGEX_ALLOCATE. */
2264
2265
2266 /* Number of failure points for which to initially allocate space
2267 when matching. If this number is exceeded, we allocate more
2268 space, so it is not a hard limit. May be overridden by defining
   INIT_FAILURE_ALLOC before this point. */
2269 #ifndef INIT_FAILURE_ALLOC
2270 #define INIT_FAILURE_ALLOC 5
2271 #endif
2272
2273 /* Roughly the maximum number of failure points on the stack. Would be
2274 exactly that if always used MAX_FAILURE_SPACE each time we failed.
2275 This is a variable only so users of regex can assign to it; we never
2276 change it ourselves.
   NOTE(review): no MAX_FAILURE_SPACE is visible nearby; DOUBLE_FAIL_STACK
   multiplies this value by MAX_FAILURE_ITEMS, which is presumably what
   is meant -- confirm. */
2277 int re_max_failures = 2000;
2278
/* Type of one failure-stack slot. Items of other types are cast to
   this type when pushed (see PUSH_FAILURE_ITEM). */
2279 typedef const unsigned char *fail_stack_elt_t;
2280
/* A growable stack of slots: `stack' is the storage, `size' the number
   of slots allocated, `avail' the number currently in use. */
2281 typedef struct
2282 {
2283 fail_stack_elt_t *stack;
2284 unsigned size;
2285 unsigned avail; /* Offset of next open position. */
2286 } fail_stack_type;
2287
/* The predicates below operate on a variable literally named
   `fail_stack' (or a pointer named `fail_stack_ptr') in the expanding
   scope; they take no arguments. */
2288 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
2289 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
2290 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
/* NOTE(review): this indexes slot `avail', which the struct comment
   describes as the next open position, not the most recently pushed
   item (that would be `avail - 1') -- confirm intended use. */
2291 #define FAIL_STACK_TOP() (fail_stack.stack[fail_stack.avail])
2292
2293
2294 /* Initialize `fail_stack'. Do `return -2' if the alloc fails.

   Allocates INIT_FAILURE_ALLOC slots with REGEX_ALLOCATE and marks the
   stack empty. Because the macro contains a `return' statement it can
   only be used inside a function whose return type accepts -2, and it
   requires a variable named `fail_stack' in scope. */
2295
2296 #define INIT_FAIL_STACK() \
2297 do { \
2298 fail_stack.stack = (fail_stack_elt_t *) \
2299 REGEX_ALLOCATE (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \
2300 \
2301 if (fail_stack.stack == NULL) \
2302 return -2; \
2303 \
2304 fail_stack.size = INIT_FAILURE_ALLOC; \
2305 fail_stack.avail = 0; \
2306 } while (0)
2307
2308
2309 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
2310
2311 Return 1 if succeeds, and 0 if either ran out of memory
2312 allocating space for it or it was already too large.
2313
2314 REGEX_REALLOCATE requires `destination' be declared.

   "Too large" means the current size already exceeds
   re_max_failures * MAX_FAILURE_ITEMS; MAX_FAILURE_ITEMS in turn reads
   a variable named `num_regs' from the expanding scope.

   The result of REGEX_REALLOCATE is stored straight back into
   (fail_stack).stack, so when the reallocation yields NULL the
   previous pointer value is no longer held by the stack and `size' is
   left unchanged. */
2315
2316 #define DOUBLE_FAIL_STACK(fail_stack) \
2317 ((fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \
2318 ? 0 \
2319 : ((fail_stack).stack = (fail_stack_elt_t *) \
2320 REGEX_REALLOCATE ((fail_stack).stack, \
2321 (fail_stack).size * sizeof (fail_stack_elt_t), \
2322 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
2323 \
2324 (fail_stack).stack == NULL \
2325 ? 0 \
2326 : ((fail_stack).size <<= 1, \
2327 1)))
2328
2329
2330 /* Push PATTERN_OP on FAIL_STACK.
2331
2332 Return 1 if was able to do so and 0 if ran out of memory allocating
2333 space to do so.

   The stack is doubled with DOUBLE_FAIL_STACK only when it is full, so
   0 is also returned when DOUBLE_FAIL_STACK refuses because the stack
   is already too large.

   Note that the fullness test goes through FAIL_STACK_FULL (), which
   always inspects the variable literally named `fail_stack'; the
   FAIL_STACK argument is therefore only honoured consistently when the
   caller passes a variable of that name. */
2334 #define PUSH_PATTERN_OP(pattern_op, fail_stack) \
2335 ((FAIL_STACK_FULL () \
2336 && !DOUBLE_FAIL_STACK (fail_stack)) \
2337 ? 0 \
2338 : ((fail_stack).stack[(fail_stack).avail++] = pattern_op, \
2339 1))
2340
2341 /* This pushes an item onto the failure stack. Must be a four-byte
2342 value. Assumes the variable `fail_stack'. Probably should only
2343 be called from within `PUSH_FAILURE_POINT'.

   ITEM is cast to fail_stack_elt_t (a pointer type) before being
   stored; no capacity check is made here, so the caller must have
   ensured a free slot. ITEM is not parenthesized in the expansion.
   NOTE(review): "four-byte" presumably reflects the pointer width on
   the original targets -- confirm on platforms with wider pointers. */
2344 #define PUSH_FAILURE_ITEM(item) \
2345 fail_stack.stack[fail_stack.avail++] = (fail_stack_elt_t) item
2346
2347 /* The complement operation. Assumes `fail_stack' is nonempty.
   Yields the most recently pushed slot and removes it. */
2348 #define POP_FAILURE_ITEM() fail_stack.stack[--fail_stack.avail]
2349
2350 /* Used to omit pushing failure point id's when we're not debugging.
   With DEBUG defined these push/pop one extra slot; otherwise they
   expand to nothing. */
2351 #ifdef DEBUG
2352 #define DEBUG_PUSH PUSH_FAILURE_ITEM
2353 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_ITEM ()
2354 #else
2355 #define DEBUG_PUSH(item)
2356 #define DEBUG_POP(item_addr)
2357 #endif
2358
2359
2360 /* Push the information about the state we will need
2361 if we ever fail back to it.
2362
2363 Requires variables fail_stack, regstart, regend, reg_info, and
2364 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be
2365 declared.
2366
2367 Does `return FAILURE_CODE' if runs out of memory.

   The expansion also reads `lowest_active_reg' and `highest_active_reg'
   from the enclosing scope (and, in the debugging statements,
   `failure_id', `bufp', `pend', `string1', `size1', `string2' and
   `size2').

   Items are pushed in this order, which POP_FAILURE_POINT must undo in
   reverse:
     - for each register from lowest_active_reg up to
       highest_active_reg: regstart[r], regend[r], reg_info[r].word;
     - lowest_active_reg;
     - highest_active_reg;
     - PATTERN_PLACE;
     - STRING_PLACE;
     - the failure id (only through DEBUG_PUSH, i.e. when DEBUG is
       defined).

   Before pushing, the stack is doubled repeatedly until
   REMAINING_AVAIL_SLOTS is at least NUM_FAILURE_ITEMS; the individual
   pushes are therefore unchecked. */
2368
2369 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
2370 do { \
2371 char *destination; \
2372 /* Must be int, so when we don't save any registers, the arithmetic \
2373 of 0 + -1 isn't done as unsigned. */ \
2374 int this_reg; \
2375 \
2376 DEBUG_STATEMENT (failure_id++); \
2377 DEBUG_STATEMENT (nfailure_points_pushed++); \
2378 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
2379 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\
2380 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
2381 \
2382 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \
2383 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \
2384 \
2385 /* Ensure we have enough space allocated for what we will push. */ \
2386 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
2387 { \
2388 if (!DOUBLE_FAIL_STACK (fail_stack)) \
2389 return failure_code; \
2390 \
2391 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
2392 (fail_stack).size); \
2393 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
2394 } \
2395 \
2396 /* Push the info, starting with the registers. */ \
2397 DEBUG_PRINT1 ("\n"); \
2398 \
2399 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
2400 this_reg++) \
2401 { \
2402 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \
2403 DEBUG_STATEMENT (num_regs_pushed++); \
2404 \
2405 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \
2406 PUSH_FAILURE_ITEM (regstart[this_reg]); \
2407 \
2408 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \
2409 PUSH_FAILURE_ITEM (regend[this_reg]); \
2410 \
2411 DEBUG_PRINT2 (" info: 0x%x\n ", reg_info[this_reg]); \
2412 DEBUG_PRINT2 (" match_null=%d", \
2413 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
2414 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
2415 DEBUG_PRINT2 (" matched_something=%d", \
2416 MATCHED_SOMETHING (reg_info[this_reg])); \
2417 DEBUG_PRINT2 (" ever_matched=%d", \
2418 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
2419 DEBUG_PRINT1 ("\n"); \
2420 PUSH_FAILURE_ITEM (reg_info[this_reg].word); \
2421 } \
2422 \
2423 DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg);\
2424 PUSH_FAILURE_ITEM (lowest_active_reg); \
2425 \
2426 DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg);\
2427 PUSH_FAILURE_ITEM (highest_active_reg); \
2428 \
2429 DEBUG_PRINT2 (" Pushing pattern 0x%x: ", pattern_place); \
2430 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
2431 PUSH_FAILURE_ITEM (pattern_place); \
2432 \
2433 DEBUG_PRINT2 (" Pushing string 0x%x: `", string_place); \
2434 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
2435 size2); \
2436 DEBUG_PRINT1 ("'\n"); \
2437 PUSH_FAILURE_ITEM (string_place); \
2438 \
2439 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
2440 DEBUG_PUSH (failure_id); \
2441 } while (0)
2442
2443 /* This is the number of items that are pushed and popped on the stack
2444 for each register: its start, its end, and its info word (see the
   register loop in PUSH_FAILURE_POINT). */
2445 #define NUM_REG_ITEMS 3
2446
2447 /* Individual items aside from the registers: the lowest and highest
   active register numbers, the pattern position and the string
   position, plus the failure point id when DEBUG is defined. */
2448 #ifdef DEBUG
2449 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
2450 #else
2451 #define NUM_NONREG_ITEMS 4
2452 #endif
2453
2454 /* We push at most this many items on the stack. Reads a variable
   named `num_regs' from the expanding scope. */
2455 #define MAX_FAILURE_ITEMS ((num_regs - 1) * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
2456
2457 /* We actually push this many items. Reads `highest_active_reg' and
   `lowest_active_reg' from the expanding scope. */
2458 #define NUM_FAILURE_ITEMS \
2459 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \
2460 + NUM_NONREG_ITEMS)
2461
2462 /* How many items can still be added to the stack without overflowing it.
   Operates on the variable named `fail_stack'. */
2463 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
2464
2465
2466 /* Pops what PUSH_FAILURE_POINT pushes, in the reverse order.
2467
2468 We restore into the parameters, all of which should be lvalues:
2469 STR -- the saved data position.
2470 PAT -- the saved pattern position.
2471 LOW_REG, HIGH_REG -- the lowest and highest active registers.
2472 REGSTART, REGEND -- arrays of string positions.
2473 REG_INFO -- array of information about each subexpression.
2474
2475 Also assumes the variables `fail_stack' and (if debugging), `bufp',
2476 `pend', `string1', `size1', `string2', and `size2'.

   STR is left untouched when the saved string position is NULL (see
   the comment inside the macro). Registers are restored from HIGH_REG
   down to LOW_REG, each as info word, then end, then start.

   Unlike PUSH_FAILURE_POINT this expands to a bare brace block rather
   than a `do { ... } while (0)' statement. */
2477
2478 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\
2479 { \
2480 DEBUG_STATEMENT (fail_stack_elt_t failure_id;) \
2481 int this_reg; \
2482 const unsigned char *string_temp; \
2483 \
2484 assert (!FAIL_STACK_EMPTY ()); \
2485 \
2486 /* Remove failure points and point to how many regs pushed. */ \
2487 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
2488 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
2489 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
2490 \
2491 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
2492 \
2493 DEBUG_POP (&failure_id); \
2494 DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \
2495 \
2496 /* If the saved string location is NULL, it came from an \
2497 on_failure_keep_string_jump opcode, and we want to throw away the \
2498 saved NULL, thus retaining our current position in the string. */ \
2499 string_temp = POP_FAILURE_ITEM (); \
2500 if (string_temp != NULL) \
2501 str = (const char *) string_temp; \
2502 \
2503 DEBUG_PRINT2 (" Popping string 0x%x: `", str); \
2504 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
2505 DEBUG_PRINT1 ("'\n"); \
2506 \
2507 pat = (unsigned char *) POP_FAILURE_ITEM (); \
2508 DEBUG_PRINT2 (" Popping pattern 0x%x: ", pat); \
2509 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
2510 \
2511 /* Restore register info. */ \
2512 high_reg = (unsigned) POP_FAILURE_ITEM (); \
2513 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \
2514 \
2515 low_reg = (unsigned) POP_FAILURE_ITEM (); \
2516 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \
2517 \
2518 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
2519 { \
2520 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \
2521 \
2522 reg_info[this_reg].word = POP_FAILURE_ITEM (); \
2523 DEBUG_PRINT2 (" info: 0x%x\n", reg_info[this_reg]); \
2524 \
2525 regend[this_reg] = (const char *) POP_FAILURE_ITEM (); \
2526 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \
2527 \
2528 regstart[this_reg] = (const char *) POP_FAILURE_ITEM (); \
2529 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \
2530 } \
2531 \
2532 DEBUG_STATEMENT (nfailure_points_popped++); \
2533 } /* POP_FAILURE_POINT */
2534
2535 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
2536 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
2537 characters can start a string that matches the pattern. This fastmap
2538 is used by re_search to skip quickly over impossible starting points.
2539
2540 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
2541 area as BUFP->fastmap.
2542
2543 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
2544 the pattern buffer.
2545
2546 Returns 0 if we succeed, -2 if an internal error. */
2547
2548 int
2549 re_compile_fastmap (bufp)
/* [<][>][^][v][top][bottom][index][help] */
2550 struct re_pattern_buffer *bufp;
2551 {
2552 int j, k;
2553 fail_stack_type fail_stack;
2554 #ifndef REGEX_MALLOC
2555 char *destination;
2556 #endif
2557 /* We don't push any register information onto the failure stack. */
2558 unsigned num_regs = 0;
2559
2560 register char *fastmap = bufp->fastmap;
2561 unsigned char *pattern = bufp->buffer;
2562 unsigned long size = bufp->used;
2563 const unsigned char *p = pattern;
2564 register unsigned char *pend = pattern + size;
2565
2566 /* Assume that each path through the pattern can be null until
2567 proven otherwise. We set this false at the bottom of switch
2568 statement, to which we get only if a particular path doesn't
2569 match the empty string. */
2570 boolean path_can_be_null = true;
2571
2572 /* We aren't doing a `succeed_n' to begin with. */
2573 boolean succeed_n_p = false;
2574
2575 assert (fastmap != NULL && p != NULL);
2576
2577 INIT_FAIL_STACK ();
2578 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
2579 bufp->fastmap_accurate = 1; /* It will be when we're done. */
2580 bufp->can_be_null = 0;
2581
2582 while (p != pend || !FAIL_STACK_EMPTY ())
2583 {
2584 if (p == pend)
2585 {
2586 bufp->can_be_null |= path_can_be_null;
2587
2588 /* Reset for next path. */
2589 path_can_be_null = true;
2590
2591 p = fail_stack.stack[--fail_stack.avail];
2592 }
2593
2594 /* We should never be about to go beyond the end of the pattern. */
2595 assert (p < pend);
2596
2597 #ifdef SWITCH_ENUM_BUG
2598 switch ((int) ((re_opcode_t) *p++))
2599 #else
2600 switch ((re_opcode_t) *p++)
2601 #endif
2602 {
2603
2604 /* I guess the idea here is to simply not bother with a fastmap
2605 if a backreference is used, since it's too hard to figure out
2606 the fastmap for the corresponding group. Setting
2607 `can_be_null' stops `re_search_2' from using the fastmap, so
2608 that is all we do. */
2609 case duplicate:
2610 bufp->can_be_null = 1;
2611 return 0;
2612
2613
2614 /* Following are the cases which match a character. These end
2615 with `break'. */
2616
2617 case exactn:
2618 fastmap[p[1]] = 1;
2619 break;
2620
2621
2622 case charset:
2623 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
2624 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
2625 fastmap[j] = 1;
2626 break;
2627
2628
2629 case charset_not:
2630 /* Chars beyond end of map must be allowed. */
2631 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
2632 fastmap[j] = 1;
2633
2634 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
2635 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
2636 fastmap[j] = 1;
2637 break;
2638
2639
2640 case wordchar:
2641 for (j = 0; j < (1 << BYTEWIDTH); j++)
2642 if (SYNTAX (j) == Sword)
2643 fastmap[j] = 1;
2644 break;
2645
2646
2647 case notwordchar:
2648 for (j = 0; j < (1 << BYTEWIDTH); j++)
2649 if (SYNTAX (j) != Sword)
2650 fastmap[j] = 1;
2651 break;
2652
2653
2654 case anychar:
2655 /* `.' matches anything ... */
2656 for (j = 0; j < (1 << BYTEWIDTH); j++)
2657 fastmap[j] = 1;
2658
2659 /* ... except perhaps newline. */
2660 if (!(bufp->syntax & RE_DOT_NEWLINE))
2661 fastmap['\n'] = 0;
2662
2663 /* Return if we have already set `can_be_null'; if we have,
2664 then the fastmap is irrelevant. Something's wrong here. */
2665 else if (bufp->can_be_null)
2666 return 0;
2667
2668 /* Otherwise, have to check alternative paths. */
2669 break;
2670
2671
2672 #ifdef emacs
2673 case syntaxspec:
2674 k = *p++;
2675 for (j = 0; j < (1 << BYTEWIDTH); j++)
2676 if (SYNTAX (j) == (enum syntaxcode) k)
2677 fastmap[j] = 1;
2678 break;
2679
2680
2681 case notsyntaxspec:
2682 k = *p++;
2683 for (j = 0; j < (1 << BYTEWIDTH); j++)
2684 if (SYNTAX (j) != (enum syntaxcode) k)
2685 fastmap[j] = 1;
2686 break;
2687
2688
2689 /* All cases after this match the empty string. These end with
2690 `continue'. */
2691
2692
2693 case before_dot:
2694 case at_dot:
2695 case after_dot:
2696 continue;
2697 #endif /* not emacs */
2698
2699
2700 case no_op:
2701 case begline:
2702 case endline:
2703 case begbuf:
2704 case endbuf:
2705 case wordbound:
2706 case notwordbound:
2707 case wordbeg:
2708 case wordend:
2709 case push_dummy_failure:
2710 continue;
2711
2712
2713 case jump_n:
2714 case pop_failure_jump:
2715 case maybe_pop_jump:
2716 case jump:
2717 case jump_past_alt:
2718 case dummy_failure_jump:
2719 EXTRACT_NUMBER_AND_INCR (j, p);
2720 p += j;
2721 if (j > 0)
2722 continue;
2723
2724 /* Jump backward implies we just went through the body of a
2725 loop and matched nothing. Opcode jumped to should be
2726 `on_failure_jump' or `succeed_n'. Just treat it like an
2727 ordinary jump. For a * loop, it has pushed its failure
2728 point already; if so, discard that as redundant. */
2729 if ((re_opcode_t) *p != on_failure_jump
2730 && (re_opcode_t) *p != succeed_n)
2731 continue;
2732
2733 p++;
2734 EXTRACT_NUMBER_AND_INCR (j, p);
2735 p += j;
2736
2737 /* If what's on the stack is where we are now, pop it. */
2738 if (!FAIL_STACK_EMPTY ()
2739 && fail_stack.stack[fail_stack.avail - 1] == p)
2740 fail_stack.avail--;
2741
2742 continue;
2743
2744
2745 case on_failure_jump:
2746 case on_failure_keep_string_jump:
2747 handle_on_failure_jump:
2748 EXTRACT_NUMBER_AND_INCR (j, p);
2749
2750 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
2751 end of the pattern. We don't want to push such a point,
2752 since when we restore it above, entering the switch will
2753 increment `p' past the end of the pattern. We don't need
2754 to push such a point since we obviously won't find any more
2755 fastmap entries beyond `pend'. Such a pattern can match
2756 the null string, though. */
2757 if (p + j < pend)
2758 {
2759 if (!PUSH_PATTERN_OP (p + j, fail_stack))
2760 return -2;
2761 }
2762 else
2763 bufp->can_be_null = 1;
2764
2765 if (succeed_n_p)
2766 {
2767 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
2768 succeed_n_p = false;
2769 }
2770
2771 continue;
2772
2773
2774 case succeed_n:
2775 /* Get to the number of times to succeed. */
2776 p += 2;
2777
2778 /* Increment p past the n for when k != 0. */
2779 EXTRACT_NUMBER_AND_INCR (k, p);
2780 if (k == 0)
2781 {
2782 p -= 4;
2783 succeed_n_p = true; /* Spaghetti code alert. */
2784 goto handle_on_failure_jump;
2785 }
2786 continue;
2787
2788
2789 case set_number_at:
2790 p += 4;
2791 continue;
2792
2793
2794 case start_memory:
2795 case stop_memory:
2796 p += 2;
2797 continue;
2798
2799
2800 default:
2801 abort (); /* We have listed all the cases. */
2802 } /* switch *p++ */
2803
2804 /* Getting here means we have found the possible starting
2805 characters for one path of the pattern -- and that the empty
2806 string does not match. We need not follow this path further.
2807 Instead, look at the next alternative (remembered on the
2808 stack), or quit if no more. The test at the top of the loop
2809 does these things. */
2810 path_can_be_null = false;
2811 p = pend;
2812 } /* while p */
2813
2814 /* Set `can_be_null' for the last path (also the first path, if the
2815 pattern is empty). */
2816 bufp->can_be_null |= path_can_be_null;
2817 return 0;
2818 } /* re_compile_fastmap */
2819
2820 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
2821 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
2822 this memory for recording register information. STARTS and ENDS
2823 must be allocated using the malloc library routine, and must each
2824 be at least NUM_REGS * sizeof (regoff_t) bytes long.
2825
2826 If NUM_REGS == 0, then subsequent matches should allocate their own
2827 register data.
2828
2829 Unless this function is called, the first search or match using
2830 PATTERN_BUFFER will allocate its own register data, without
2831 freeing the old data. */
2832
2833 void
2834 re_set_registers (bufp, regs, num_regs, starts, ends)
/* [<][>][^][v][top][bottom][index][help] */
2835 struct re_pattern_buffer *bufp;
2836 struct re_registers *regs;
2837 unsigned num_regs;
2838 regoff_t *starts, *ends;
2839 {
2840 if (num_regs)
2841 {
2842 bufp->regs_allocated = REGS_REALLOCATE;
2843 regs->num_regs = num_regs;
2844 regs->start = starts;
2845 regs->end = ends;
2846 }
2847 else
2848 {
2849 bufp->regs_allocated = REGS_UNALLOCATED;
2850 regs->num_regs = 0;
2851 regs->start = regs->end = (regoff_t) 0;
2852 }
2853 }
2854
2855 /* Searching routines. */
2856
/* Single-string front end to re_search_2: STRING (of length SIZE) is
   passed as the second string with an empty first string, and the
   match is allowed to run to the end of STRING.  The result is
   whatever re_search_2 returns.  */

int
re_search (bufp, string, size, startpos, range, regs)
     struct re_pattern_buffer *bufp;
     const char *string;
     int size, startpos, range;
     struct re_registers *regs;
{
  /* No separate stopping point: matching may extend to the end.  */
  int stop = size;

  return re_search_2 (bufp, NULL, 0, string, size,
                      startpos, range, regs, stop);
}
2870
2871
2872 /* Using the compiled pattern in BUFP->buffer, first tries to match the
2873 virtual concatenation of STRING1 and STRING2, starting first at index
2874 STARTPOS, then at STARTPOS + 1, and so on.
2875
2876 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
2877
2878 RANGE is how far to scan while trying to match. RANGE = 0 means try
2879 only at STARTPOS; in general, the last start tried is STARTPOS +
2880 RANGE.
2881
2882 In REGS, return the indices of the virtual concatenation of STRING1
2883 and STRING2 that matched the entire BUFP->buffer and its contained
2884 subexpressions.
2885
2886 Do not consider matching one past the index STOP in the virtual
2887 concatenation of STRING1 and STRING2.
2888
2889 We return either the position in the strings at which the match was
2890 found, -1 if no match, or -2 if error (such as failure
2891 stack overflow). */
2892
2893 int
2894 re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
/* [<][>][^][v][top][bottom][index][help] */
2895 struct re_pattern_buffer *bufp;
2896 const char *string1, *string2;
2897 int size1, size2;
2898 int startpos;
2899 int range;
2900 struct re_registers *regs;
2901 int stop;
2902 {
2903 int val;
2904 register char *fastmap = bufp->fastmap;
2905 register char *translate = bufp->translate;
2906 int total_size = size1 + size2;
2907 int endpos = startpos + range;
2908
2909 /* Check for out-of-range STARTPOS. */
2910 if (startpos < 0 || startpos > total_size)
2911 return -1;
2912
2913 /* Fix up RANGE if it might eventually take us outside
2914 the virtual concatenation of STRING1 and STRING2. */
2915 if (endpos < -1)
2916 range = -1 - startpos;
2917 else if (endpos > total_size)
2918 range = total_size - startpos;
2919
2920 /* If the search isn't to be a backwards one, don't waste time in a
2921 search for a pattern that must be anchored. */
2922 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
2923 {
2924 if (startpos > 0)
2925 return -1;
2926 else
2927 range = 1;
2928 }
2929
2930 /* Update the fastmap now if not correct already. */
2931 if (fastmap && !bufp->fastmap_accurate)
2932 if (re_compile_fastmap (bufp) == -2)
2933 return -2;
2934
2935 /* Loop through the string, looking for a place to start matching. */
2936 for (;;)
2937 {
2938 /* If a fastmap is supplied, skip quickly over characters that
2939 cannot be the start of a match. If the pattern can match the
2940 null string, however, we don't need to skip characters; we want
2941 the first null string. */
2942 if (fastmap && startpos < total_size && !bufp->can_be_null)
2943 {
2944 if (range > 0) /* Searching forwards. */
2945 {
2946 register const char *d;
2947 register int lim = 0;
2948 int irange = range;
2949
2950 if (startpos < size1 && startpos + range >= size1)
2951 lim = range - (size1 - startpos);
2952
2953 d = (startpos >= size1 ? string2 - size1 : string1) + startpos;
2954
2955 /* Written out as an if-else to avoid testing `translate'
2956 inside the loop. */
2957 if (translate)
2958 while (range > lim
2959 && !fastmap[(unsigned char)
2960 translate[(unsigned char) *d++]])
2961 range--;
2962 else
2963 while (range > lim && !fastmap[(unsigned char) *d++])
2964 range--;
2965
2966 startpos += irange - range;
2967 }
2968 else /* Searching backwards. */
2969 {
2970 register char c = (size1 == 0 || startpos >= size1
2971 ? string2[startpos - size1]
2972 : string1[startpos]);
2973
2974 if (!fastmap[(unsigned char) TRANSLATE (c)])
2975 goto advance;
2976 }
2977 }
2978
2979 /* If can't match the null string, and that's all we have left, fail. */
2980 if (range >= 0 && startpos == total_size && fastmap
2981 && !bufp->can_be_null)
2982 return -1;
2983
2984 val = re_match_2 (bufp, string1, size1, string2, size2,
2985 startpos, regs, stop);
2986 if (val >= 0)
2987 return startpos;
2988
2989 if (val == -2)
2990 return -2;
2991
2992 advance:
2993 if (!range)
2994 break;
2995 else if (range > 0)
2996 {
2997 range--;
2998 startpos++;
2999 }
3000 else
3001 {
3002 range++;
3003 startpos--;
3004 }
3005 }
3006 return -1;
3007 } /* re_search_2 */
3008
3009 /* Declarations and macros for re_match_2. */
3010
3011 static int bcmp_translate ();
3012 static boolean alt_match_null_string_p (),
3013 common_op_match_null_string_p (),
3014 group_match_null_string_p ();
3015
3016 /* Structure for per-register (a.k.a. per-group) information.
3017 This must not be longer than one word, because we push this value
3018 onto the failure stack. Other register information, such as the
3019 starting and ending positions (which are addresses), and the list of
3020 inner groups (which is a bits list) are maintained in separate
3021 variables.
3022
3023 We are making a (strictly speaking) nonportable assumption here: that
3024 the compiler will pack our bit fields into something that fits into
3025 the type of `word', i.e., is something that fits into one item on the
3026 failure stack. */
3027 typedef union
3028 {
/* View of the whole record as one failure-stack element. */
3029 fail_stack_elt_t word;
3030 struct
3031 {
3032 /* This field is one if this group can match the empty string,
3033 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
3034 #define MATCH_NULL_UNSET_VALUE 3
3035 unsigned match_null_string_p : 2;
/* Set to 1 by start_memory and back to 0 by stop_memory in re_match_2. */
3036 unsigned is_active : 1;
/* Set by SET_REGS_MATCHED; reset to 0 each time start_memory runs for
   this group. */
3037 unsigned matched_something : 1;
/* Set by SET_REGS_MATCHED; cleared in stop_memory when the group's old
   register values are restored. */
3038 unsigned ever_matched_something : 1;
3039 } bits;
3040 } register_info_type;
3041
/* Accessors for the bit fields of a register_info_type R.  Each expands
   to the field itself, so it can be read or used as the target of an
   assignment. */
3042 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
/* [<][>][^][v][top][bottom][index][help] */
3043 #define IS_ACTIVE(R) ((R).bits.is_active)
/* [<][>][^][v][top][bottom][index][help] */
3044 #define MATCHED_SOMETHING(R) ((R).bits.matched_something)
/* [<][>][^][v][top][bottom][index][help] */
3045 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
/* [<][>][^][v][top][bottom][index][help] */
3046
3047
3048 /* Call this when we have matched a real character; it sets both the
3049 `matched_something' and `ever_matched_something' flags of every
3050 register from `lowest_active_reg' through `highest_active_reg'. */
/* Uses `lowest_active_reg', `highest_active_reg' and `reg_info' from the
   expanding scope.  When no register is active these hold
   NO_LOWEST_ACTIVE_REG and NO_HIGHEST_ACTIVE_REG, the former being the
   larger, so the loop body does not run. */
3051 #define SET_REGS_MATCHED() \
/* [<][>][^][v][top][bottom][index][help] */
3052 do \
3053 { \
3054 unsigned r; \
3055 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
3056 { \
3057 MATCHED_SOMETHING (reg_info[r]) \
3058 = EVER_MATCHED_SOMETHING (reg_info[r]) \
3059 = 1; \
3060 } \
3061 } \
3062 while (0)
3063
3064
3065 /* Convert PTR, a pointer into one of the search strings `string1' and
3066 `string2', into an offset: relative to `string1' if FIRST_STRING_P (PTR), otherwise relative to `string2' plus `size1'. */
/* PTR is evaluated more than once; `string1', `string2' and `size1' are
   taken from the expanding scope. */
3067 #define POINTER_TO_OFFSET(ptr) \
/* [<][>][^][v][top][bottom][index][help] */
3068 (FIRST_STRING_P (ptr) ? (ptr) - string1 : (ptr) - string2 + size1)
3069
3070 /* Registers are set to a sentinel when they haven't yet matched. */
/* REG_UNSET (E) is nonzero when the register pointer E still equals that
   sentinel, REG_UNSET_VALUE. */
3071 #define REG_UNSET_VALUE ((char *) -1)
3072 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
/* [<][>][^][v][top][bottom][index][help] */
3073
3074
3075 /* Macros for dealing with the split strings in re_match_2. */
3076
/* Nonzero when `dend' equals `end_match_1'; re_match_2 sets `dend' to
   `end_match_1' when it starts scanning inside string1. */
3077 #define MATCHING_IN_FIRST_STRING (dend == end_match_1)
3078
3079 /* Call before fetching a character with *d. This switches over to
3080 string2 if necessary. */
/* While `d' sits at `dend': if `dend' is already `end_match_2' there is
   nothing left to read and control jumps to the label `fail'; otherwise
   `d' and `dend' are moved to `string2' and `end_match_2'.  The expansion
   is a bare `while' statement that uses `d', `dend', `string2' and
   `end_match_2' from the expanding scope. */
3081 #define PREFETCH() \
/* [<][>][^][v][top][bottom][index][help] */
3082 while (d == dend) \
3083 { \
3084 /* End of string2 => fail. */ \
3085 if (dend == end_match_2) \
3086 goto fail; \
3087 /* End of string1 => advance to string2. */ \
3088 d = string2; \
3089 dend = end_match_2; \
3090 }
3091
3092
3093 /* Test if at very beginning or at very end of the virtual concatenation
3094 of `string1' and `string2'. If only one string, it's `string2'. */
/* AT_STRINGS_BEG (D) is also nonzero whenever `size2' is zero, whatever D
   is.  AT_STRINGS_END (D) compares D with `end2' only. */
3095 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
/* [<][>][^][v][top][bottom][index][help] */
3096 #define AT_STRINGS_END(d) ((d) == end2)
/* [<][>][^][v][top][bottom][index][help] */
3097
3098
3099 /* Test if D points to a character which is word-constituent. We have
3100 two special cases to check for: if past the end of string1, look at
3101 the first character in string2; and if before the beginning of
3102 string2, look at the last character in string1. */
/* The chosen character is passed to SYNTAX and the result compared with
   `Sword'.  D is evaluated more than once; `end1' and `string2' come from
   the expanding scope. */
3103 #define WORDCHAR_P(d) \
/* [<][>][^][v][top][bottom][index][help] */
3104 (SYNTAX ((d) == end1 ? *string2 \
3105 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
3106 == Sword)
3107
3108 /* Test if the character before D and the one at D differ with respect
3109 to being word-constituent. */
/* Also nonzero when D is at the very beginning or very end of the
   strings.  Those tests come first in the `||' chain, so WORDCHAR_P is
   evaluated only when D is at neither end. */
3110 #define AT_WORD_BOUNDARY(d) \
/* [<][>][^][v][top][bottom][index][help] */
3111 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
3112 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
3113
3114
3115 /* Free everything we malloc. */
3116 #ifdef REGEX_MALLOC
/* FREE_VAR (VAR) frees VAR if it is non-null and then resets it to NULL.
   The body is wrapped in do ... while (0) so that the expansion is a
   single statement: written as two bare statements, the `var = NULL'
   part would escape an enclosing unbraced `if' or `else'.  VAR must be
   an assignable pointer and is evaluated more than once. */
3117 #define FREE_VAR(var) do { if (var) free (var); (var) = NULL; } while (0)
/* [<][>][^][v][top][bottom][index][help] */
/* FREE_VARIABLES () releases the failure stack and every register vector
   that re_match_2 allocates; all of those names come from the expanding
   scope. */
3118 #define FREE_VARIABLES() \
/* [<][>][^][v][top][bottom][index][help] */
3119 do { \
3120 FREE_VAR (fail_stack.stack); \
3121 FREE_VAR (regstart); \
3122 FREE_VAR (regend); \
3123 FREE_VAR (old_regstart); \
3124 FREE_VAR (old_regend); \
3125 FREE_VAR (best_regstart); \
3126 FREE_VAR (best_regend); \
3127 FREE_VAR (reg_info); \
3128 FREE_VAR (reg_dummy); \
3129 FREE_VAR (reg_info_dummy); \
3130 } while (0)
3131 #else /* not REGEX_MALLOC */
3132 /* Some MIPS systems (at least) want this to free alloca'd storage. */
3133 #define FREE_VARIABLES() alloca (0)
/* [<][>][^][v][top][bottom][index][help] */
3134 #endif /* not REGEX_MALLOC */
3135
3136
3137 /* These values must meet several constraints. They must not be valid
3138 register values; since we have a limit of 255 registers (because
3139 we use only one byte in the pattern for the register number), we can
3140 use numbers larger than 255. They must differ by 1, because of
3141 NUM_FAILURE_ITEMS above. And the value for the lowest register must
3142 be larger than the value for the highest register, so we do not try
3143 to actually save any registers when none are active. */
/* re_match_2 initializes `highest_active_reg' and `lowest_active_reg' to
   these values, and its start_memory case tests
   `lowest_active_reg == NO_LOWEST_ACTIVE_REG' to detect that no register
   was active before. */
3144 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
3145 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
3146
3147 /* Matching routines. */
3148
3149 #ifndef emacs /* Emacs never uses this. */
3150 /* re_match is like re_match_2 except it takes only a single string. */
/* STRING (of length SIZE) is passed to re_match_2 as its second string,
   together with a null first string of length 0; matching starts at POS
   and SIZE is passed as the stop position.  The value returned by
   re_match_2 is returned unchanged.  Compiled only when `emacs' is not
   defined. */
3151
3152 int
3153 re_match (bufp, string, size, pos, regs)
/* [<][>][^][v][top][bottom][index][help] */
3154 struct re_pattern_buffer *bufp;
3155 const char *string;
3156 int size, pos;
3157 struct re_registers *regs;
3158 {
3159 return re_match_2 (bufp, NULL, 0, string, size, pos, regs, size);
3160 }
3161 #endif /* not emacs */
3162
3163
3164 /* re_match_2 matches the compiled pattern in BUFP against the
3165 (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
3166 and SIZE2, respectively). We start matching at POS, and stop
3167 matching at STOP.
3168
3169 If REGS is non-null and the `no_sub' field of BUFP is zero, we
3170 store offsets for the substring each group matched in REGS. See the
3171 documentation for exactly how many groups we fill.
3172
3173 We return -1 if no match, -2 if an internal error (such as the
3174 failure stack overflowing). Otherwise, we return the length of the
3175 matched substring. */
3176
3177 int
3178 re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
/* [<][>][^][v][top][bottom][index][help] */
3179 struct re_pattern_buffer *bufp;
3180 const char *string1, *string2;
3181 int size1, size2;
3182 int pos;
3183 struct re_registers *regs;
3184 int stop;
3185 {
3186 /* General temporaries. */
3187 int mcnt;
3188 unsigned char *p1;
3189
3190 /* Just past the end of the corresponding string. */
3191 const char *end1, *end2;
3192
3193 /* Pointers into string1 and string2, just past the last characters in
3194 each to consider matching. */
3195 const char *end_match_1, *end_match_2;
3196
3197 /* Where we are in the data, and the end of the current string. */
3198 const char *d, *dend;
3199
3200 /* Where we are in the pattern, and the end of the pattern. */
3201 unsigned char *p = bufp->buffer;
3202 register unsigned char *pend = p + bufp->used;
3203
3204 /* We use this to map every character in the string. */
3205 char *translate = bufp->translate;
3206
3207 /* Failure point stack. Each place that can handle a failure further
3208 down the line pushes a failure point on this stack. It consists of
3209 restart, regend, and reg_info for all registers corresponding to
3210 the subexpressions we're currently inside, plus the number of such
3211 registers, and, finally, two char *'s. The first char * is where
3212 to resume scanning the pattern; the second one is where to resume
3213 scanning the strings. If the latter is zero, the failure point is
3214 a ``dummy''; if a failure happens and the failure point is a dummy,
3215 it gets discarded and the next one is tried. */
3216 fail_stack_type fail_stack;
3217 #ifdef DEBUG
3218 static unsigned failure_id = 0;
3219 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
3220 #endif
3221
3222 /* We fill all the registers internally, independent of what we
3223 return, for use in backreferences. The number here includes
3224 an element for register zero. */
3225 unsigned num_regs = bufp->re_nsub + 1;
3226
3227 /* The currently active registers. */
3228 unsigned lowest_active_reg = NO_LOWEST_ACTIVE_REG;
3229 unsigned highest_active_reg = NO_HIGHEST_ACTIVE_REG;
3230
3231 /* Information on the contents of registers. These are pointers into
3232 the input strings; they record just what was matched (on this
3233 attempt) by a subexpression part of the pattern, that is, the
3234 regnum-th regstart pointer points to where in the pattern we began
3235 matching and the regnum-th regend points to right after where we
3236 stopped matching the regnum-th subexpression. (The zeroth register
3237 keeps track of what the whole pattern matches.) */
3238 const char **regstart, **regend;
3239
3240 /* If a group that's operated upon by a repetition operator fails to
3241 match anything, then the register for its start will need to be
3242 restored because it will have been set to wherever in the string we
3243 are when we last see its open-group operator. Similarly for a
3244 register's end. */
3245 const char **old_regstart, **old_regend;
3246
3247 /* The is_active field of reg_info helps us keep track of which (possibly
3248 nested) subexpressions we are currently in. The matched_something
3249 field of reg_info[reg_num] helps us tell whether or not we have
3250 matched any of the pattern so far this time through the reg_num-th
3251 subexpression. These two fields get reset each time through any
3252 loop their register is in. */
3253 register_info_type *reg_info;
3254
3255 /* The following record the register info as found in the above
3256 variables when we find a match better than any we've seen before.
3257 This happens as we backtrack through the failure points, which in
3258 turn happens only if we have not yet matched the entire string. */
3259 unsigned best_regs_set = false;
3260 const char **best_regstart, **best_regend;
3261
3262 /* Logically, this is `best_regend[0]'. But we don't want to have to
3263 allocate space for that if we're not allocating space for anything
3264 else (see below). Also, we never need info about register 0 for
3265 any of the other register vectors, and it seems rather a kludge to
3266 treat `best_regend' differently than the rest. So we keep track of
3267 the end of the best match so far in a separate variable. We
3268 initialize this to NULL so that when we backtrack the first time
3269 and need to test it, it's not garbage. */
3270 const char *match_end = NULL;
3271
3272 /* Used when we pop values we don't care about. */
3273 const char **reg_dummy;
3274 register_info_type *reg_info_dummy;
3275
3276 #ifdef DEBUG
3277 /* Counts the total number of registers pushed. */
3278 unsigned num_regs_pushed = 0;
3279 #endif
3280
3281 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
3282
3283 INIT_FAIL_STACK ();
3284
3285 /* Do not bother to initialize all the register variables if there are
3286 no groups in the pattern, as it takes a fair amount of time. If
3287 there are groups, we include space for register 0 (the whole
3288 pattern), even though we never use it, since it simplifies the
3289 array indexing. We should fix this. */
3290 if (bufp->re_nsub)
3291 {
3292 regstart = REGEX_TALLOC (num_regs, const char *);
3293 regend = REGEX_TALLOC (num_regs, const char *);
3294 old_regstart = REGEX_TALLOC (num_regs, const char *);
3295 old_regend = REGEX_TALLOC (num_regs, const char *);
3296 best_regstart = REGEX_TALLOC (num_regs, const char *);
3297 best_regend = REGEX_TALLOC (num_regs, const char *);
3298 reg_info = REGEX_TALLOC (num_regs, register_info_type);
3299 reg_dummy = REGEX_TALLOC (num_regs, const char *);
3300 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
3301
3302 if (!(regstart && regend && old_regstart && old_regend && reg_info
3303 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
3304 {
3305 FREE_VARIABLES ();
3306 return -2;
3307 }
3308 }
3309 #ifdef REGEX_MALLOC
3310 else
3311 {
3312 /* We must initialize all our variables to NULL, so that
3313 `FREE_VARIABLES' doesn't try to free them. */
3314 regstart = regend = old_regstart = old_regend = best_regstart
3315 = best_regend = reg_dummy = NULL;
3316 reg_info = reg_info_dummy = (register_info_type *) NULL;
3317 }
3318 #endif /* REGEX_MALLOC */
3319
3320 /* The starting position is bogus. */
3321 if (pos < 0 || pos > size1 + size2)
3322 {
3323 FREE_VARIABLES ();
3324 return -1;
3325 }
3326
3327 /* Initialize subexpression text positions to -1 to mark ones that no
3328 start_memory/stop_memory has been seen for. Also initialize the
3329 register information struct. */
3330 for (mcnt = 1; mcnt < num_regs; mcnt++)
3331 {
3332 regstart[mcnt] = regend[mcnt]
3333 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
3334
3335 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
3336 IS_ACTIVE (reg_info[mcnt]) = 0;
3337 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
3338 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
3339 }
3340
3341 /* We move `string1' into `string2' if the latter's empty -- but not if
3342 `string1' is null. */
3343 if (size2 == 0 && string1 != NULL)
3344 {
3345 string2 = string1;
3346 size2 = size1;
3347 string1 = 0;
3348 size1 = 0;
3349 }
3350 end1 = string1 + size1;
3351 end2 = string2 + size2;
3352
3353 /* Compute where to stop matching, within the two strings. */
3354 if (stop <= size1)
3355 {
3356 end_match_1 = string1 + stop;
3357 end_match_2 = string2;
3358 }
3359 else
3360 {
3361 end_match_1 = end1;
3362 end_match_2 = string2 + stop - size1;
3363 }
3364
3365 /* `p' scans through the pattern as `d' scans through the data.
3366 `dend' is the end of the input string that `d' points within. `d'
3367 is advanced into the following input string whenever necessary, but
3368 this happens before fetching; therefore, at the beginning of the
3369 loop, `d' can be pointing at the end of a string, but it cannot
3370 equal `string2'. */
3371 if (size1 > 0 && pos <= size1)
3372 {
3373 d = string1 + pos;
3374 dend = end_match_1;
3375 }
3376 else
3377 {
3378 d = string2 + pos - size1;
3379 dend = end_match_2;
3380 }
3381
3382 DEBUG_PRINT1 ("The compiled pattern is: ");
3383 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
3384 DEBUG_PRINT1 ("The string to match is: `");
3385 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
3386 DEBUG_PRINT1 ("'\n");
3387
3388 /* This loops over pattern commands. It exits by returning from the
3389 function if the match is complete, or it drops through if the match
3390 fails at this starting point in the input data. */
3391 for (;;)
3392 {
3393 DEBUG_PRINT2 ("\n0x%x: ", p);
3394
3395 if (p == pend)
3396 { /* End of pattern means we might have succeeded. */
3397 DEBUG_PRINT1 ("end of pattern ... ");
3398
3399 /* If we haven't matched the entire string, and we want the
3400 longest match, try backtracking. */
3401 if (d != end_match_2)
3402 {
3403 DEBUG_PRINT1 ("backtracking.\n");
3404
3405 if (!FAIL_STACK_EMPTY ())
3406 { /* More failure points to try. */
3407 boolean same_str_p = (FIRST_STRING_P (match_end)
3408 == MATCHING_IN_FIRST_STRING);
3409
3410 /* If exceeds best match so far, save it. */
3411 if (!best_regs_set
3412 || (same_str_p && d > match_end)
3413 || (!same_str_p && !MATCHING_IN_FIRST_STRING))
3414 {
3415 best_regs_set = true;
3416 match_end = d;
3417
3418 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
3419
3420 for (mcnt = 1; mcnt < num_regs; mcnt++)
3421 {
3422 best_regstart[mcnt] = regstart[mcnt];
3423 best_regend[mcnt] = regend[mcnt];
3424 }
3425 }
3426 goto fail;
3427 }
3428
3429 /* If no failure points, don't restore garbage. */
3430 else if (best_regs_set)
3431 {
3432 restore_best_regs:
3433 /* Restore best match. It may happen that `dend ==
3434 end_match_1' while the restored d is in string2.
3435 For example, the pattern `x.*y.*z' against the
3436 strings `x-' and `y-z-', if the two strings are
3437 not consecutive in memory. */
3438 DEBUG_PRINT1 ("Restoring best registers.\n");
3439
3440 d = match_end;
3441 dend = ((d >= string1 && d <= end1)
3442 ? end_match_1 : end_match_2);
3443
3444 for (mcnt = 1; mcnt < num_regs; mcnt++)
3445 {
3446 regstart[mcnt] = best_regstart[mcnt];
3447 regend[mcnt] = best_regend[mcnt];
3448 }
3449 }
3450 } /* d != end_match_2 */
3451
3452 DEBUG_PRINT1 ("Accepting match.\n");
3453
3454 /* If caller wants register contents data back, do it. */
3455 if (regs && !bufp->no_sub)
3456 {
3457 /* Have the register data arrays been allocated? */
3458 if (bufp->regs_allocated == REGS_UNALLOCATED)
3459 { /* No. So allocate them with malloc. We need one
3460 extra element beyond `num_regs' for the `-1' marker
3461 GNU code uses. */
3462 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
3463 regs->start = TALLOC (regs->num_regs, regoff_t);
3464 regs->end = TALLOC (regs->num_regs, regoff_t);
3465 if (regs->start == NULL || regs->end == NULL)
3466 return -2;
3467 bufp->regs_allocated = REGS_REALLOCATE;
3468 }
3469 else if (bufp->regs_allocated == REGS_REALLOCATE)
3470 { /* Yes. If we need more elements than were already
3471 allocated, reallocate them. If we need fewer, just
3472 leave it alone. */
3473 if (regs->num_regs < num_regs + 1)
3474 {
3475 regs->num_regs = num_regs + 1;
3476 RETALLOC (regs->start, regs->num_regs, regoff_t);
3477 RETALLOC (regs->end, regs->num_regs, regoff_t);
3478 if (regs->start == NULL || regs->end == NULL)
3479 return -2;
3480 }
3481 }
3482 else
3483 assert (bufp->regs_allocated == REGS_FIXED);
3484
3485 /* Convert the pointer data in `regstart' and `regend' to
3486 indices. Register zero has to be set differently,
3487 since we haven't kept track of any info for it. */
3488 if (regs->num_regs > 0)
3489 {
3490 regs->start[0] = pos;
3491 regs->end[0] = (MATCHING_IN_FIRST_STRING ? d - string1
3492 : d - string2 + size1);
3493 }
3494
3495 /* Go through the first `min (num_regs, regs->num_regs)'
3496 registers, since that is all we initialized. */
3497 for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++)
3498 {
3499 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
3500 regs->start[mcnt] = regs->end[mcnt] = -1;
3501 else
3502 {
3503 regs->start[mcnt] = POINTER_TO_OFFSET (regstart[mcnt]);
3504 regs->end[mcnt] = POINTER_TO_OFFSET (regend[mcnt]);
3505 }
3506 }
3507
3508 /* If the regs structure we return has more elements than
3509 were in the pattern, set the extra elements to -1. If
3510 we (re)allocated the registers, this is the case,
3511 because we always allocate enough to have at least one
3512 -1 at the end. */
3513 for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++)
3514 regs->start[mcnt] = regs->end[mcnt] = -1;
3515 } /* regs && !bufp->no_sub */
3516
3517 FREE_VARIABLES ();
3518 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
3519 nfailure_points_pushed, nfailure_points_popped,
3520 nfailure_points_pushed - nfailure_points_popped);
3521 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
3522
3523 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
3524 ? string1
3525 : string2 - size1);
3526
3527 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
3528
3529 return mcnt;
3530 }
3531
3532 /* Otherwise match next pattern command. */
3533 #ifdef SWITCH_ENUM_BUG
3534 switch ((int) ((re_opcode_t) *p++))
3535 #else
3536 switch ((re_opcode_t) *p++)
3537 #endif
3538 {
3539 /* Ignore these. Used to ignore the n of succeed_n's which
3540 currently have n == 0. */
3541 case no_op:
3542 DEBUG_PRINT1 ("EXECUTING no_op.\n");
3543 break;
3544
3545
3546 /* Match the next n pattern characters exactly. The following
3547 byte in the pattern defines n, and the n bytes after that
3548 are the characters to match. */
3549 case exactn:
3550 mcnt = *p++;
3551 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
3552
3553 /* This is written out as an if-else so we don't waste time
3554 testing `translate' inside the loop. */
3555 if (translate)
3556 {
3557 do
3558 {
3559 PREFETCH ();
3560 if (translate[(unsigned char) *d++] != (char) *p++)
3561 goto fail;
3562 }
3563 while (--mcnt);
3564 }
3565 else
3566 {
3567 do
3568 {
3569 PREFETCH ();
3570 if (*d++ != (char) *p++) goto fail;
3571 }
3572 while (--mcnt);
3573 }
3574 SET_REGS_MATCHED ();
3575 break;
3576
3577
3578 /* Match any character except possibly a newline or a null. */
3579 case anychar:
3580 DEBUG_PRINT1 ("EXECUTING anychar.\n");
3581
3582 PREFETCH ();
3583
3584 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
3585 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
3586 goto fail;
3587
3588 SET_REGS_MATCHED ();
3589 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
3590 d++;
3591 break;
3592
3593
3594 case charset:
3595 case charset_not:
3596 {
3597 register unsigned char c;
3598 boolean not = (re_opcode_t) *(p - 1) == charset_not;
3599
3600 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
3601
3602 PREFETCH ();
3603 c = TRANSLATE (*d); /* The character to match. */
3604
3605 /* Cast to `unsigned' instead of `unsigned char' in case the
3606 bit list is a full 32 bytes long. */
3607 if (c < (unsigned) (*p * BYTEWIDTH)
3608 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
3609 not = !not;
3610
3611 p += 1 + *p;
3612
3613 if (!not) goto fail;
3614
3615 SET_REGS_MATCHED ();
3616 d++;
3617 break;
3618 }
3619
3620
3621 /* The beginning of a group is represented by start_memory.
3622 The arguments are the register number in the next byte, and the
3623 number of groups inner to this one in the next. The text
3624 matched within the group is recorded (in the internal
3625 registers data structure) under the register number. */
3626 case start_memory:
3627 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]);
3628
3629 /* Find out if this group can match the empty string. */
3630 p1 = p; /* To send to group_match_null_string_p. */
3631
3632 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
3633 REG_MATCH_NULL_STRING_P (reg_info[*p])
3634 = group_match_null_string_p (&p1, pend, reg_info);
3635
3636 /* Save the position in the string where we were the last time
3637 we were at this open-group operator in case the group is
3638 operated upon by a repetition operator, e.g., with `(a*)*b'
3639 against `ab'; then we want to ignore where we are now in
3640 the string in case this attempt to match fails. */
3641 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
3642 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
3643 : regstart[*p];
3644 DEBUG_PRINT2 (" old_regstart: %d\n",
3645 POINTER_TO_OFFSET (old_regstart[*p]));
3646
3647 regstart[*p] = d;
3648 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
3649
3650 IS_ACTIVE (reg_info[*p]) = 1;
3651 MATCHED_SOMETHING (reg_info[*p]) = 0;
3652
3653 /* This is the new highest active register. */
3654 highest_active_reg = *p;
3655
3656 /* If nothing was active before, this is the new lowest active
3657 register. */
3658 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
3659 lowest_active_reg = *p;
3660
3661 /* Move past the register number and inner group count. */
3662 p += 2;
3663 break;
3664
3665
3666 /* The stop_memory opcode represents the end of a group. Its
3667 arguments are the same as start_memory's: the register
3668 number, and the number of inner groups. */
3669 case stop_memory:
3670 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]);
3671
3672 /* We need to save the string position the last time we were at
3673 this close-group operator in case the group is operated
3674 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
3675 against `aba'; then we want to ignore where we are now in
3676 the string in case this attempt to match fails. */
3677 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
3678 ? REG_UNSET (regend[*p]) ? d : regend[*p]
3679 : regend[*p];
3680 DEBUG_PRINT2 (" old_regend: %d\n",
3681 POINTER_TO_OFFSET (old_regend[*p]));
3682
3683 regend[*p] = d;
3684 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
3685
3686 /* This register isn't active anymore. */
3687 IS_ACTIVE (reg_info[*p]) = 0;
3688
3689 /* If this was the only register active, nothing is active
3690 anymore. */
3691 if (lowest_active_reg == highest_active_reg)
3692 {
3693 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
3694 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
3695 }
3696 else
3697 { /* We must scan for the new highest active register, since
3698 it isn't necessarily one less than now: consider
3699 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
3700 new highest active register is 1. */
3701 unsigned char r = *p - 1;
3702 while (r > 0 && !IS_ACTIVE (reg_info[r]))
3703 r--;
3704
3705 /* If we end up at register zero, that means that we saved
3706 the registers as the result of an `on_failure_jump', not
3707 a `start_memory', and we jumped to past the innermost
3708 `stop_memory'. For example, in ((.)*) we save
3709 registers 1 and 2 as a result of the *, but when we pop
3710 back to the second ), we are at the stop_memory 1.
3711 Thus, nothing is active. */
3712 if (r == 0)
3713 {
3714 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
3715 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
3716 }
3717 else
3718 highest_active_reg = r;
3719 }
3720
3721 /* If just failed to match something this time around with a
3722 group that's operated on by a repetition operator, try to
3723 force exit from the ``loop'', and restore the register
3724 information for this group that we had before trying this
3725 last match. */
3726 if ((!MATCHED_SOMETHING (reg_info[*p])
3727 || (re_opcode_t) p[-3] == start_memory)
3728 && (p + 2) < pend)
3729 {
3730 boolean is_a_jump_n = false;
3731
3732 p1 = p + 2;
3733 mcnt = 0;
3734 switch ((re_opcode_t) *p1++)
3735 {
3736 case jump_n:
3737 is_a_jump_n = true;
3738 case pop_failure_jump:
3739 case maybe_pop_jump:
3740 case jump:
3741 case dummy_failure_jump:
3742 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
3743 if (is_a_jump_n)
3744 p1 += 2;
3745 break;
3746
3747 default:
3748 /* do nothing */ ;
3749 }
3750 p1 += mcnt;
3751
3752 /* If the next operation is a jump backwards in the pattern
3753 to an on_failure_jump right before the start_memory
3754 corresponding to this stop_memory, exit from the loop
3755 by forcing a failure after pushing on the stack the
3756 on_failure_jump's jump in the pattern, and d. */
3757 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
3758 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p)
3759 {
3760 /* If this group ever matched anything, then restore
3761 what its registers were before trying this last
3762 failed match, e.g., with `(a*)*b' against `ab' for
3763 regstart[1], and, e.g., with `((a*)*(b*)*)*'
3764 against `aba' for regend[3].
3765
3766 Also restore the registers for inner groups for,
3767 e.g., `((a*)(b*))*' against `aba' (register 3 would
3768 otherwise get trashed). */
3769
3770 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
3771 {
3772 unsigned r;
3773
3774 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
3775
3776 /* Restore this and inner groups' (if any) registers. */
3777 for (r = *p; r < *p + *(p + 1); r++)
3778 {
3779 regstart[r] = old_regstart[r];
3780
3781 /* xx why this test? */
3782 if ((int) old_regend[r] >= (int) regstart[r])
3783 regend[r] = old_regend[r];
3784 }
3785 }
3786 p1++;
3787 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
3788 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
3789
3790 goto fail;
3791 }
3792 }
3793
3794 /* Move past the register number and the inner group count. */
3795 p += 2;
3796 break;
3797
3798
3799 /* \<digit> has been turned into a `duplicate' command which is
3800 followed by the numeric value of <digit> as the register number. */
3801 case duplicate:
3802 {
3803 register const char *d2, *dend2;
3804 int regno = *p++; /* Get which register to match against. */
3805 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
3806
3807 /* Can't back reference a group which we've never matched. */
3808 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
3809 goto fail;
3810
3811 /* Where in input to try to start matching. */
3812 d2 = regstart[regno];
3813
3814 /* Where to stop matching; if both the place to start and
3815 the place to stop matching are in the same string, then
3816 set to the place to stop, otherwise, for now have to use
3817 the end of the first string. */
3818
3819 dend2 = ((FIRST_STRING_P (regstart[regno])
3820 == FIRST_STRING_P (regend[regno]))
3821 ? regend[regno] : end_match_1);
3822 for (;;)
3823 {
3824 /* If necessary, advance to next segment in register
3825 contents. */
3826 while (d2 == dend2)
3827 {
3828 if (dend2 == end_match_2) break;
3829 if (dend2 == regend[regno]) break;
3830
3831 /* End of string1 => advance to string2. */
3832 d2 = string2;
3833 dend2 = regend[regno];
3834 }
3835 /* At end of register contents => success */
3836 if (d2 == dend2) break;
3837
3838 /* If necessary, advance to next segment in data. */
3839 PREFETCH ();
3840
3841 /* How many characters left in this segment to match. */
3842 mcnt = dend - d;
3843
3844 /* Want how many consecutive characters we can match in
3845 one shot, so, if necessary, adjust the count. */
3846 if (mcnt > dend2 - d2)
3847 mcnt = dend2 - d2;
3848
3849 /* Compare that many; failure if mismatch, else move
3850 past them. */
3851 if (translate
3852 ? bcmp_translate (d, d2, mcnt, translate)
3853 : bcmp (d, d2, mcnt))
3854 goto fail;
3855 d += mcnt, d2 += mcnt;
3856 }
3857 }
3858 break;
3859
3860
3861 /* begline matches the empty string at the beginning of the string
3862 (unless `not_bol' is set in `bufp'), and, if
3863 `newline_anchor' is set, after newlines. */
3864 case begline:
3865 DEBUG_PRINT1 ("EXECUTING begline.\n");
3866
3867 if (AT_STRINGS_BEG (d))
3868 {
3869 if (!bufp->not_bol) break;
3870 }
3871 else if (d[-1] == '\n' && bufp->newline_anchor)
3872 {
3873 break;
3874 }
3875 /* In all other cases, we fail. */
3876 goto fail;
3877
3878
3879 /* endline is the dual of begline. */
3880 case endline:
3881 DEBUG_PRINT1 ("EXECUTING endline.\n");
3882
3883 if (AT_STRINGS_END (d))
3884 {
3885 if (!bufp->not_eol) break;
3886 }
3887
3888 /* We have to ``prefetch'' the next character. */
3889 else if ((d == end1 ? *string2 : *d) == '\n'
3890 && bufp->newline_anchor)
3891 {
3892 break;
3893 }
3894 goto fail;
3895
3896
3897 /* Match at the very beginning of the data. */
3898 case begbuf:
3899 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
3900 if (AT_STRINGS_BEG (d))
3901 break;
3902 goto fail;
3903
3904
3905 /* Match at the very end of the data. */
3906 case endbuf:
3907 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
3908 if (AT_STRINGS_END (d))
3909 break;
3910 goto fail;
3911
3912
3913 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
3914 pushes NULL as the value for the string on the stack. Then
3915 `pop_failure_point' will keep the current value for the
3916 string, instead of restoring it. To see why, consider
3917 matching `foo\nbar' against `.*\n'. The .* matches the foo;
3918 then the . fails against the \n. But the next thing we want
3919 to do is match the \n against the \n; if we restored the
3920 string value, we would be back at the foo.
3921
3922 Because this is used only in specific cases, we don't need to
3923 check all the things that `on_failure_jump' does, to make
3924 sure the right things get saved on the stack. Hence we don't
3925 share its code. The only reason to push anything on the
3926 stack at all is that otherwise we would have to change
3927 `anychar's code to do something besides goto fail in this
3928 case; that seems worse than this. */
3929 case on_failure_keep_string_jump:
3930 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
3931
3932 EXTRACT_NUMBER_AND_INCR (mcnt, p);
3933 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt);
3934
3935 PUSH_FAILURE_POINT (p + mcnt, NULL, -2);
3936 break;
3937
3938
3939 /* Uses of on_failure_jump:
3940
3941 Each alternative starts with an on_failure_jump that points
3942 to the beginning of the next alternative. Each alternative
3943 except the last ends with a jump that in effect jumps past
3944 the rest of the alternatives. (They really jump to the
3945 ending jump of the following alternative, because tensioning
3946 these jumps is a hassle.)
3947
3948 Repeats start with an on_failure_jump that points past both
3949 the repetition text and either the following jump or
3950 pop_failure_jump back to this on_failure_jump. */
3951 case on_failure_jump:
3952 on_failure:
3953 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
3954
3955 EXTRACT_NUMBER_AND_INCR (mcnt, p);
3956 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt);
3957
3958 /* If this on_failure_jump comes right before a group (i.e.,
3959 the original * applied to a group), save the information
3960 for that group and all inner ones, so that if we fail back
3961 to this point, the group's information will be correct.
3962 For example, in \(a*\)*\1, we need the preceding group,
3963 and in \(\(a*\)b*\)\2, we need the inner group. */
3964
3965 /* We can't use `p' to check ahead because we push
3966 a failure point to `p + mcnt' after we do this. */
3967 p1 = p;
3968
3969 /* We need to skip no_op's before we look for the
3970 start_memory in case this on_failure_jump is happening as
3971 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
3972 against aba. */
3973 while (p1 < pend && (re_opcode_t) *p1 == no_op)
3974 p1++;
3975
3976 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
3977 {
3978 /* We have a new highest active register now. This will
3979 get reset at the start_memory we are about to get to,
3980 but we will have saved all the registers relevant to
3981 this repetition op, as described above. */
3982 highest_active_reg = *(p1 + 1) + *(p1 + 2);
3983 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
3984 lowest_active_reg = *(p1 + 1);
3985 }
3986
3987 DEBUG_PRINT1 (":\n");
3988 PUSH_FAILURE_POINT (p + mcnt, d, -2);
3989 break;
3990
3991
3992 /* A smart repeat ends with `maybe_pop_jump'.
3993 We change it to either `pop_failure_jump' or `jump'. */
3994 case maybe_pop_jump:
3995 EXTRACT_NUMBER_AND_INCR (mcnt, p);
3996 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
3997 {
3998 register unsigned char *p2 = p;
3999
4000 /* Compare the beginning of the repeat with what in the
4001 pattern follows its end. If we can establish that there
4002 is nothing that they would both match, i.e., that we
4003 would have to backtrack because of (as in, e.g., `a*a')
4004 then we can change to pop_failure_jump, because we'll
4005 never have to backtrack.
4006
4007 This is not true in the case of alternatives: in
4008 `(a|ab)*' we do need to backtrack to the `ab' alternative
4009 (e.g., if the string was `ab'). But instead of trying to
4010 detect that here, the alternative has put on a dummy
4011 failure point which is what we will end up popping. */
4012
4013 /* Skip over open/close-group commands. */
4014 while (p2 + 2 < pend
4015 && ((re_opcode_t) *p2 == stop_memory
4016 || (re_opcode_t) *p2 == start_memory))
4017 p2 += 3; /* Skip over args, too. */
4018
4019 /* If we're at the end of the pattern, we can change. */
4020 if (p2 == pend)
4021 {
4022 /* Consider what happens when matching ":\(.*\)"
4023 against ":/". I don't really understand this code
4024 yet. */
4025 p[-3] = (unsigned char) pop_failure_jump;
4026 DEBUG_PRINT1
4027 (" End of pattern: change to `pop_failure_jump'.\n");
4028 }
4029
4030 else if ((re_opcode_t) *p2 == exactn
4031 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
4032 {
4033 register unsigned char c
4034 = *p2 == (unsigned char) endline ? '\n' : p2[2];
4035 p1 = p + mcnt;
4036
4037 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
4038 to the `maybe_finalize_jump' of this case. Examine what
4039 follows. */
4040 if ((re_opcode_t) p1[3] == exactn && p1[5] != c)
4041 {
4042 p[-3] = (unsigned char) pop_failure_jump;
4043 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
4044 c, p1[5]);
4045 }
4046
4047 else if ((re_opcode_t) p1[3] == charset
4048 || (re_opcode_t) p1[3] == charset_not)
4049 {
4050 int not = (re_opcode_t) p1[3] == charset_not;
4051
4052 if (c < (unsigned char) (p1[4] * BYTEWIDTH)
4053 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4054 not = !not;
4055
4056 /* `not' is equal to 1 if c would match, which means
4057 that we can't change to pop_failure_jump. */
4058 if (!not)
4059 {
4060 p[-3] = (unsigned char) pop_failure_jump;
4061 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
4062 }
4063 }
4064 }
4065 }
4066 p -= 2; /* Point at relative address again. */
4067 if ((re_opcode_t) p[-1] != pop_failure_jump)
4068 {
4069 p[-1] = (unsigned char) jump;
4070 DEBUG_PRINT1 (" Match => jump.\n");
4071 goto unconditional_jump;
4072 }
4073 /* Note fall through. */
4074
4075
4076 /* The end of a simple repeat has a pop_failure_jump back to
4077 its matching on_failure_jump, where the latter will push a
4078 failure point. The pop_failure_jump takes off failure
4079 points put on by this pop_failure_jump's matching
4080 on_failure_jump; we got through the pattern to here from the
4081 matching on_failure_jump, so didn't fail. */
4082 case pop_failure_jump:
4083 {
4084 /* We need to pass separate storage for the lowest and
4085 highest registers, even though we don't care about the
4086 actual values. Otherwise, we will restore only one
4087 register from the stack, since lowest will == highest in
4088 `pop_failure_point'. */
4089 unsigned dummy_low_reg, dummy_high_reg;
4090 unsigned char *pdummy;
4091 const char *sdummy;
4092
4093 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
4094 POP_FAILURE_POINT (sdummy, pdummy,
4095 dummy_low_reg, dummy_high_reg,
4096 reg_dummy, reg_dummy, reg_info_dummy);
4097 }
4098 /* Note fall through. */
4099
4100
4101 /* Unconditionally jump (without popping any failure points). */
4102 case jump:
4103 unconditional_jump:
4104 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
4105 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
4106 p += mcnt; /* Do the jump. */
4107 DEBUG_PRINT2 ("(to 0x%x).\n", p);
4108 break;
4109
4110
4111 /* We need this opcode so we can detect where alternatives end
4112 in `group_match_null_string_p' et al. */
4113 case jump_past_alt:
4114 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
4115 goto unconditional_jump;
4116
4117
4118 /* Normally, the on_failure_jump pushes a failure point, which
4119 then gets popped at pop_failure_jump. We will end up at
4120 pop_failure_jump, also, and with a pattern of, say, `a+', we
4121 are skipping over the on_failure_jump, so we have to push
4122 something meaningless for pop_failure_jump to pop. */
4123 case dummy_failure_jump:
4124 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
4125 /* It doesn't matter what we push for the string here. What
4126 the code at `fail' tests is the value for the pattern. */
4127 PUSH_FAILURE_POINT (0, 0, -2);
4128 goto unconditional_jump;
4129
4130
4131 /* At the end of an alternative, we need to push a dummy failure
4132 point in case we are followed by a `pop_failure_jump', because
4133 we don't want the failure point for the alternative to be
4134 popped. For example, matching `(a|ab)*' against `aab'
4135 requires that we match the `ab' alternative. */
4136 case push_dummy_failure:
4137 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
4138 /* See comments just above at `dummy_failure_jump' about the
4139 two zeroes. */
4140 PUSH_FAILURE_POINT (0, 0, -2);
4141 break;
4142
4143 /* Have to succeed matching what follows at least n times.
4144 After that, handle like `on_failure_jump'. */
4145 case succeed_n:
4146 EXTRACT_NUMBER (mcnt, p + 2);
4147 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
4148
4149 assert (mcnt >= 0);
4150 /* Originally, this is how many times we HAVE to succeed. */
4151 if (mcnt > 0)
4152 {
4153 mcnt--;
4154 p += 2;
4155 STORE_NUMBER_AND_INCR (p, mcnt);
4156 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p, mcnt);
4157 }
4158 else if (mcnt == 0)
4159 {
4160 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2);
4161 p[2] = (unsigned char) no_op;
4162 p[3] = (unsigned char) no_op;
4163 goto on_failure;
4164 }
4165 break;
4166
4167 case jump_n:
4168 EXTRACT_NUMBER (mcnt, p + 2);
4169 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
4170
4171 /* Originally, this is how many times we CAN jump. */
4172 if (mcnt)
4173 {
4174 mcnt--;
4175 STORE_NUMBER (p + 2, mcnt);
4176 goto unconditional_jump;
4177 }
4178 /* If don't have to jump any more, skip over the rest of command. */
4179 else
4180 p += 4;
4181 break;
4182
4183 case set_number_at:
4184 {
4185 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
4186
4187 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4188 p1 = p + mcnt;
4189 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4190 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt);
4191 STORE_NUMBER (p1, mcnt);
4192 break;
4193 }
4194
4195 case wordbound:
4196 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
4197 if (AT_WORD_BOUNDARY (d))
4198 break;
4199 goto fail;
4200
4201 case notwordbound:
4202 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
4203 if (AT_WORD_BOUNDARY (d))
4204 goto fail;
4205 break;
4206
4207 case wordbeg:
4208 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
4209 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
4210 break;
4211 goto fail;
4212
4213 case wordend:
4214 DEBUG_PRINT1 ("EXECUTING wordend.\n");
4215 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
4216 && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
4217 break;
4218 goto fail;
4219
4220 #ifdef emacs
4221 #ifdef emacs19
4222 case before_dot:
4223 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
4224 if (PTR_CHAR_POS ((unsigned char *) d) >= point)
4225 goto fail;
4226 break;
4227
4228 case at_dot:
4229 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
4230 if (PTR_CHAR_POS ((unsigned char *) d) != point)
4231 goto fail;
4232 break;
4233
4234 case after_dot:
4235 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
4236 if (PTR_CHAR_POS ((unsigned char *) d) <= point)
4237 goto fail;
4238 break;
4239 #else /* not emacs19 */
4240 case at_dot:
4241 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
4242 if (PTR_CHAR_POS ((unsigned char *) d) + 1 != point)
4243 goto fail;
4244 break;
4245 #endif /* not emacs19 */
4246
4247 case syntaxspec:
4248 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
4249 mcnt = *p++;
4250 goto matchsyntax;
4251
4252 case wordchar:
4253 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
4254 mcnt = (int) Sword;
4255 matchsyntax:
4256 PREFETCH ();
4257 if (SYNTAX (*d++) != (enum syntaxcode) mcnt)
4258 goto fail;
4259 SET_REGS_MATCHED ();
4260 break;
4261
4262 case notsyntaxspec:
4263 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
4264 mcnt = *p++;
4265 goto matchnotsyntax;
4266
4267 case notwordchar:
4268 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
4269 mcnt = (int) Sword;
4270 matchnotsyntax:
4271 PREFETCH ();
4272 if (SYNTAX (*d++) == (enum syntaxcode) mcnt)
4273 goto fail;
4274 SET_REGS_MATCHED ();
4275 break;
4276
4277 #else /* not emacs */
4278 case wordchar:
4279 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
4280 PREFETCH ();
4281 if (!WORDCHAR_P (d))
4282 goto fail;
4283 SET_REGS_MATCHED ();
4284 d++;
4285 break;
4286
4287 case notwordchar:
4288 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
4289 PREFETCH ();
4290 if (WORDCHAR_P (d))
4291 goto fail;
4292 SET_REGS_MATCHED ();
4293 d++;
4294 break;
4295 #endif /* not emacs */
4296
4297 default:
4298 abort ();
4299 }
4300 continue; /* Successfully executed one pattern command; keep going. */
4301
4302
4303 /* We goto here if a matching operation fails. */
4304 fail:
4305 if (!FAIL_STACK_EMPTY ())
4306 { /* A restart point is known. Restore to that state. */
4307 DEBUG_PRINT1 ("\nFAIL:\n");
4308 POP_FAILURE_POINT (d, p,
4309 lowest_active_reg, highest_active_reg,
4310 regstart, regend, reg_info);
4311
4312 /* If this failure point is a dummy, try the next one. */
4313 if (!p)
4314 goto fail;
4315
4316 /* If we failed to the end of the pattern, don't examine *p. */
4317 assert (p <= pend);
4318 if (p < pend)
4319 {
4320 boolean is_a_jump_n = false;
4321
4322 /* If failed to a backwards jump that's part of a repetition
4323 loop, need to pop this failure point and use the next one. */
4324 switch ((re_opcode_t) *p)
4325 {
4326 case jump_n:
4327 is_a_jump_n = true;
4328 case maybe_pop_jump:
4329 case pop_failure_jump:
4330 case jump:
4331 p1 = p + 1;
4332 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4333 p1 += mcnt;
4334
4335 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
4336 || (!is_a_jump_n
4337 && (re_opcode_t) *p1 == on_failure_jump))
4338 goto fail;
4339 break;
4340 default:
4341 /* do nothing */ ;
4342 }
4343 }
4344
4345 if (d >= string1 && d <= end1)
4346 dend = end_match_1;
4347 }
4348 else
4349 break; /* Matching at this starting point really fails. */
4350 } /* for (;;) */
4351
4352 if (best_regs_set)
4353 goto restore_best_regs;
4354
4355 FREE_VARIABLES ();
4356
4357 return -1; /* Failure to match. */
4358 } /* re_match_2 */
4359
4360 /* Subroutine definitions for re_match_2. */
4361
4362
4363 /* We are passed P pointing to a register number after a start_memory.
4364
4365 Return true if the pattern up to the corresponding stop_memory can
4366 match the empty string, and false otherwise.
4367
4368 If we find the matching stop_memory, sets P to point to one past its number.
4369 Otherwise, sets P to an undefined byte less than or equal to END.
4370
4371 We don't handle duplicates properly (yet). */
4372
4373 static boolean
4374 group_match_null_string_p (p, end, reg_info)
/* [<][>][^][v][top][bottom][index][help] */
4375 unsigned char **p, *end;
4376 register_info_type *reg_info;
4377 {
4378 int mcnt;
4379 /* Point to after the args to the start_memory. */
4380 unsigned char *p1 = *p + 2;
4381
4382 while (p1 < end)
4383 {
4384 /* Skip over opcodes that can match nothing, and return true or
4385 false, as appropriate, when we get to one that can't, or to the
4386 matching stop_memory. */
4387
4388 switch ((re_opcode_t) *p1)
4389 {
4390 /* Could be either a loop or a series of alternatives. */
4391 case on_failure_jump:
4392 p1++;
4393 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4394
4395 /* If the next operation is not a jump backwards in the
4396 pattern. */
4397
4398 if (mcnt >= 0)
4399 {
4400 /* Go through the on_failure_jumps of the alternatives,
4401 seeing if any of the alternatives cannot match nothing.
4402 The last alternative starts with only a jump,
4403 whereas the rest start with on_failure_jump and end
4404 with a jump, e.g., here is the pattern for `a|b|c':
4405
4406 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
4407 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
4408 /exactn/1/c
4409
4410 So, we have to first go through the first (n-1)
4411 alternatives and then deal with the last one separately. */
4412
4413
4414 /* Deal with the first (n-1) alternatives, which start
4415 with an on_failure_jump (see above) that jumps to right
4416 past a jump_past_alt. */
4417
4418 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt)
4419 {
4420 /* `mcnt' holds how many bytes long the alternative
4421 is, including the ending `jump_past_alt' and
4422 its number. */
4423
4424 if (!alt_match_null_string_p (p1, p1 + mcnt - 3,
4425 reg_info))
4426 return false;
4427
4428 /* Move to right after this alternative, including the
4429 jump_past_alt. */
4430 p1 += mcnt;
4431
4432 /* Break if it's the beginning of an n-th alternative
4433 that doesn't begin with an on_failure_jump. */
4434 if ((re_opcode_t) *p1 != on_failure_jump)
4435 break;
4436
4437 /* Still have to check that it's not an n-th
4438 alternative that starts with an on_failure_jump. */
4439 p1++;
4440 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4441 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt)
4442 {
4443 /* Get to the beginning of the n-th alternative. */
4444 p1 -= 3;
4445 break;
4446 }
4447 }
4448
4449 /* Deal with the last alternative: go back and get number
4450 of the `jump_past_alt' just before it. `mcnt' contains
4451 the length of the alternative. */
4452 EXTRACT_NUMBER (mcnt, p1 - 2);
4453
4454 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
4455 return false;
4456
4457 p1 += mcnt; /* Get past the n-th alternative. */
4458 } /* if mcnt > 0 */
4459 break;
4460
4461
4462 case stop_memory:
4463 assert (p1[1] == **p);
4464 *p = p1 + 2;
4465 return true;
4466
4467
4468 default:
4469 if (!common_op_match_null_string_p (&p1, end, reg_info))
4470 return false;
4471 }
4472 } /* while p1 < end */
4473
4474 return false;
4475 } /* group_match_null_string_p */
4476
4477
4478 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
4479 It expects P to be the first byte of a single alternative and END one
4480 byte past the last. The alternative can contain groups. */
4481
4482 static boolean
4483 alt_match_null_string_p (p, end, reg_info)
/* [<][>][^][v][top][bottom][index][help] */
4484 unsigned char *p, *end;
4485 register_info_type *reg_info;
4486 {
4487 int mcnt;
4488 unsigned char *p1 = p;
4489
4490 while (p1 < end)
4491 {
4492 /* Skip over opcodes that can match nothing, and break when we get
4493 to one that can't. */
4494
4495 switch ((re_opcode_t) *p1)
4496 {
4497 /* It's a loop. */
4498 case on_failure_jump:
4499 p1++;
4500 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4501 p1 += mcnt;
4502 break;
4503
4504 default:
4505 if (!common_op_match_null_string_p (&p1, end, reg_info))
4506 return false;
4507 }
4508 } /* while p1 < end */
4509
4510 return true;
4511 } /* alt_match_null_string_p */
4512
4513
4514 /* Deals with the ops common to group_match_null_string_p and
4515 alt_match_null_string_p.
4516
4517 Sets P to one after the op and its arguments, if any. */
4518
4519 static boolean
4520 common_op_match_null_string_p (p, end, reg_info)
/* [<][>][^][v][top][bottom][index][help] */
4521 unsigned char **p, *end;
4522 register_info_type *reg_info;
4523 {
4524 int mcnt;
4525 boolean ret;
4526 int reg_no;
4527 unsigned char *p1 = *p;
4528
4529 switch ((re_opcode_t) *p1++)
4530 {
4531 case no_op:
4532 case begline:
4533 case endline:
4534 case begbuf:
4535 case endbuf:
4536 case wordbeg:
4537 case wordend:
4538 case wordbound:
4539 case notwordbound:
4540 #ifdef emacs
4541 case before_dot:
4542 case at_dot:
4543 case after_dot:
4544 #endif
4545 break;
4546
4547 case start_memory:
4548 reg_no = *p1;
4549 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
4550 ret = group_match_null_string_p (&p1, end, reg_info);
4551
4552 /* Have to set this here in case we're checking a group which
4553 contains a group and a back reference to it. */
4554
4555 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
4556 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
4557
4558 if (!ret)
4559 return false;
4560 break;
4561
4562 /* If this is an optimized succeed_n for zero times, make the jump. */
4563 case jump:
4564 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4565 if (mcnt >= 0)
4566 p1 += mcnt;
4567 else
4568 return false;
4569 break;
4570
4571 case succeed_n:
4572 /* Get to the number of times to succeed. */
4573 p1 += 2;
4574 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4575
4576 if (mcnt == 0)
4577 {
4578 p1 -= 4;
4579 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4580 p1 += mcnt;
4581 }
4582 else
4583 return false;
4584 break;
4585
4586 case duplicate:
4587 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
4588 return false;
4589 break;
4590
4591 case set_number_at:
4592 p1 += 4;
4593
4594 default:
4595 /* All other opcodes mean we cannot match the empty string. */
4596 return false;
4597 }
4598
4599 *p = p1;
4600 return true;
4601 } /* common_op_match_null_string_p */
4602
4603
/* Compare the first LEN bytes of S1 and S2 after mapping each byte
   through TRANSLATE.  Returns 0 when every translated pair is equal
   and 1 at the first pair that differs.  */

static int
bcmp_translate (s1, s2, len, translate)
     unsigned char *s1, *s2;
     register int len;
     char *translate;
{
  register unsigned char *left = s1;
  register unsigned char *right = s2;

  for (; len; len--)
    {
      if (translate[*left++] != translate[*right++])
        return 1;
    }

  return 0;
}
4621
4622 /* Entry points for GNU code. */
4623
4624 /* re_compile_pattern is the GNU regular expression compiler: it
4625 compiles PATTERN (of length SIZE) and puts the result in BUFP.
4626 Returns 0 if the pattern was valid, otherwise an error string.
4627
4628 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
4629 are set in BUFP on entry.
4630
4631 We call regex_compile to do the actual compilation. */
4632
4633 const char *
4634 re_compile_pattern (pattern, length, bufp)
/* [<][>][^][v][top][bottom][index][help] */
4635 const char *pattern;
4636 int length;
4637 struct re_pattern_buffer *bufp;
4638 {
4639 reg_errcode_t ret;
4640
4641 /* GNU code is written to assume at least RE_NREGS registers will be set
4642 (and at least one extra will be -1). */
4643 bufp->regs_allocated = REGS_UNALLOCATED;
4644
4645 /* And GNU code determines whether or not to get register information
4646 by passing null for the REGS argument to re_match, etc., not by
4647 setting no_sub. */
4648 bufp->no_sub = 0;
4649
4650 /* Match anchors at newline. */
4651 bufp->newline_anchor = 1;
4652
4653 ret = regex_compile (pattern, length, re_syntax_options, bufp);
4654
4655 return re_error_msg[(int) ret];
4656 }
4657
4658 /* Entry points compatible with 4.2 BSD regex library. We don't define
4659 them if this is an Emacs or POSIX compilation. */
4660
4661 #if !defined (emacs) && !defined (_POSIX_SOURCE)
4662
4663 /* BSD has one and only one pattern buffer. */
4664 static struct re_pattern_buffer re_comp_buf;
4665
4666 char *
4667 re_comp (s)
/* [<][>][^][v][top][bottom][index][help] */
4668 const char *s;
4669 {
4670 reg_errcode_t ret;
4671
4672 if (!s)
4673 {
4674 if (!re_comp_buf.buffer)
4675 return "No previous regular expression";
4676 return 0;
4677 }
4678
4679 if (!re_comp_buf.buffer)
4680 {
4681 re_comp_buf.buffer = (unsigned char *) malloc (200);
4682 if (re_comp_buf.buffer == NULL)
4683 return "Memory exhausted";
4684 re_comp_buf.allocated = 200;
4685
4686 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
4687 if (re_comp_buf.fastmap == NULL)
4688 return "Memory exhausted";
4689 }
4690
4691 /* Since `re_exec' always passes NULL for the `regs' argument, we
4692 don't need to initialize the pattern buffer fields which affect it. */
4693
4694 /* Match anchors at newlines. */
4695 re_comp_buf.newline_anchor = 1;
4696
4697 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
4698
4699 /* Yes, we're discarding `const' here. */
4700 return (char *) re_error_msg[(int) ret];
4701 }
4702
4703
4704 int
4705 re_exec (s)
/* [<][>][^][v][top][bottom][index][help] */
4706 const char *s;
4707 {
4708 const int len = strlen (s);
4709 return
4710 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
4711 }
4712 #endif /* not emacs and not _POSIX_SOURCE */
4713
4714 /* POSIX.2 functions. Don't define these for Emacs. */
4715
4716 #ifndef emacs
4717
4718 /* regcomp takes a regular expression as a string and compiles it.
4719
4720 PREG is a regex_t *. We do not expect any fields to be initialized,
4721 since POSIX says we shouldn't. Thus, we set
4722
4723 `buffer' to the compiled pattern;
4724 `used' to the length of the compiled pattern;
4725 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
4726 REG_EXTENDED bit in CFLAGS is set; otherwise, to
4727 RE_SYNTAX_POSIX_BASIC;
4728 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
4729 `fastmap' and `fastmap_accurate' to zero;
4730 `re_nsub' to the number of subexpressions in PATTERN.
4731
4732 PATTERN is the address of the pattern string.
4733
4734 CFLAGS is a series of bits which affect compilation.
4735
4736 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
4737 use POSIX basic syntax.
4738
4739 If REG_NEWLINE is set, then . and [^...] don't match newline.
4740 Also, regexec will try a match beginning after every newline.
4741
4742 If REG_ICASE is set, then we considers upper- and lowercase
4743 versions of letters to be equivalent when matching.
4744
4745 If REG_NOSUB is set, then when PREG is passed to regexec, that
4746 routine will report only success or failure, and nothing about the
4747 registers.
4748
4749 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
4750 the return codes and their meanings.) */
4751
4752 int
4753 regcomp (preg, pattern, cflags)
/* [<][>][^][v][top][bottom][index][help] */
4754 regex_t *preg;
4755 const char *pattern;
4756 int cflags;
4757 {
4758 reg_errcode_t ret;
4759 unsigned syntax
4760 = (cflags & REG_EXTENDED) ?
4761 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
4762
4763 /* regex_compile will allocate the space for the compiled pattern. */
4764 preg->buffer = 0;
4765 preg->allocated = 0;
4766
4767 /* Don't bother to use a fastmap when searching. This simplifies the
4768 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
4769 characters after newlines into the fastmap. This way, we just try
4770 every character. */
4771 preg->fastmap = 0;
4772
4773 if (cflags & REG_ICASE)
4774 {
4775 unsigned i;
4776
4777 preg->translate = (char *) malloc (CHAR_SET_SIZE);
4778 if (preg->translate == NULL)
4779 return (int) REG_ESPACE;
4780
4781 /* Map uppercase characters to corresponding lowercase ones. */
4782 for (i = 0; i < CHAR_SET_SIZE; i++)
4783 preg->translate[i] = ISUPPER (i) ? tolower (i) : i;
4784 }
4785 else
4786 preg->translate = NULL;
4787
4788 /* If REG_NEWLINE is set, newlines are treated differently. */
4789 if (cflags & REG_NEWLINE)
4790 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
4791 syntax &= ~RE_DOT_NEWLINE;
4792 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
4793 /* It also changes the matching behavior. */
4794 preg->newline_anchor = 1;
4795 }
4796 else
4797 preg->newline_anchor = 0;
4798
4799 preg->no_sub = !!(cflags & REG_NOSUB);
4800
4801 /* POSIX says a null character in the pattern terminates it, so we
4802 can use strlen here in compiling the pattern. */
4803 ret = regex_compile (pattern, strlen (pattern), syntax, preg);
4804
4805 /* POSIX doesn't distinguish between an unmatched open-group and an
4806 unmatched close-group: both are REG_EPAREN. */
4807 if (ret == REG_ERPAREN) ret = REG_EPAREN;
4808
4809 return (int) ret;
4810 }
4811
4812
4813 /* regexec searches for a given pattern, specified by PREG, in the
4814 string STRING.
4815
4816 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
4817 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
4818 least NMATCH elements, and we set them to the offsets of the
4819 corresponding matched substrings.
4820
4821 EFLAGS specifies `execution flags' which affect matching: if
4822 REG_NOTBOL is set, then ^ does not match at the beginning of the
4823 string; if REG_NOTEOL is set, then $ does not match at the end.
4824
4825 We return 0 if we find a match and REG_NOMATCH if not. */
4826
4827 int
4828 regexec (preg, string, nmatch, pmatch, eflags)
/* [<][>][^][v][top][bottom][index][help] */
4829 const regex_t *preg;
4830 const char *string;
4831 size_t nmatch;
4832 regmatch_t pmatch[];
4833 int eflags;
4834 {
4835 int ret;
4836 struct re_registers regs;
4837 regex_t private_preg;
4838 int len = strlen (string);
4839 boolean want_reg_info = !preg->no_sub && nmatch > 0;
4840
4841 private_preg = *preg;
4842
4843 private_preg.not_bol = !!(eflags & REG_NOTBOL);
4844 private_preg.not_eol = !!(eflags & REG_NOTEOL);
4845
4846 /* The user has told us exactly how many registers to return
4847 information about, via `nmatch'. We have to pass that on to the
4848 matching routines. */
4849 private_preg.regs_allocated = REGS_FIXED;
4850
4851 if (want_reg_info)
4852 {
4853 regs.num_regs = nmatch;
4854 regs.start = TALLOC (nmatch, regoff_t);
4855 regs.end = TALLOC (nmatch, regoff_t);
4856 if (regs.start == NULL || regs.end == NULL)
4857 return (int) REG_NOMATCH;
4858 }
4859
4860 /* Perform the searching operation. */
4861 ret = re_search (&private_preg, string, len,
4862 /* start: */ 0, /* range: */ len,
4863 want_reg_info ? ®s : (struct re_registers *) 0);
4864
4865 /* Copy the register information to the POSIX structure. */
4866 if (want_reg_info)
4867 {
4868 if (ret >= 0)
4869 {
4870 unsigned r;
4871
4872 for (r = 0; r < nmatch; r++)
4873 {
4874 pmatch[r].rm_so = regs.start[r];
4875 pmatch[r].rm_eo = regs.end[r];
4876 }
4877 }
4878
4879 /* If we needed the temporary register info, free the space now. */
4880 free (regs.start);
4881 free (regs.end);
4882 }
4883
4884 /* We want zero return to mean success, unlike `re_search'. */
4885 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
4886 }
4887
4888
4889 /* Returns a message corresponding to an error code, ERRCODE, returned
4890 from either regcomp or regexec. We don't use PREG here. */
4891
4892 size_t
4893 regerror (errcode, preg, errbuf, errbuf_size)
/* [<][>][^][v][top][bottom][index][help] */
4894 int errcode;
4895 const regex_t *preg;
4896 char *errbuf;
4897 size_t errbuf_size;
4898 {
4899 const char *msg;
4900 size_t msg_size;
4901
4902 if (errcode < 0
4903 || errcode >= (sizeof (re_error_msg) / sizeof (re_error_msg[0])))
4904 /* Only error codes returned by the rest of the code should be passed
4905 to this routine. If we are given anything else, or if other regex
4906 code generates an invalid error code, then the program has a bug.
4907 Dump core so we can fix it. */
4908 abort ();
4909
4910 msg = re_error_msg[errcode];
4911
4912 /* POSIX doesn't require that we do anything in this case, but why
4913 not be nice. */
4914 if (! msg)
4915 msg = "Success";
4916
4917 msg_size = strlen (msg) + 1; /* Includes the null. */
4918
4919 if (errbuf_size != 0)
4920 {
4921 if (msg_size > errbuf_size)
4922 {
4923 strncpy (errbuf, msg, errbuf_size - 1);
4924 errbuf[errbuf_size - 1] = 0;
4925 }
4926 else
4927 strcpy (errbuf, msg);
4928 }
4929
4930 return msg_size;
4931 }
4932
4933
4934 /* Free dynamically allocated space used by PREG. */
4935
4936 void
4937 regfree (preg)
/* [<][>][^][v][top][bottom][index][help] */
4938 regex_t *preg;
4939 {
4940 if (preg->buffer != NULL)
4941 free (preg->buffer);
4942 preg->buffer = NULL;
4943
4944 preg->allocated = 0;
4945 preg->used = 0;
4946
4947 if (preg->fastmap != NULL)
4948 free (preg->fastmap);
4949 preg->fastmap = NULL;
4950 preg->fastmap_accurate = 0;
4951
4952 if (preg->translate != NULL)
4953 free (preg->translate);
4954 preg->translate = NULL;
4955 }
4956
4957 #endif /* not emacs */
4958
4959 /*
4960 Local variables:
4961 make-backup-files: t
4962 version-control: t
4963 trim-versions-without-asking: nil
4964 End:
4965 */