]> gcc.gnu.org Git - gcc.git/blame - contrib/fixinc/regex.c
Update FSF address in copyright notice.
[gcc.git] / contrib / fixinc / regex.c
CommitLineData
99024956
BK
1/* Extended regular expression matching and search library,
2 version 0.12.
3 (Implements POSIX draft P10003.2/D11.2, except for
4 internationalization features.)
5
6 Copyright (C) 1993 Free Software Foundation, Inc.
7
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
11 any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
aa5a7ea3
JL
20 Foundation, 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA. */
99024956
BK
22
23/* AIX requires this to be the first thing in the file. */
24#if defined (_AIX) && !defined (REGEX_MALLOC)
25 #pragma alloca
26#endif
27
38e01259 28/* $Id: regex.c,v 1.3 1999/01/11 13:25:47 law Exp $ */
99024956
BK
29
30#define _GNU_SOURCE
31
32/* We need this for `regex.h', and perhaps for the Emacs include files. */
33#ifdef __TANDEM
34/*
35 * WARN 107: constant used as logical expression
36 * Side effect of some complex macros
37 */
38# pragma NOWARN( 107 )
39
40/*
41 * WARN 95: prototype function declaration not in scope:
42 * TANDEM brain damaged C.
43 */
44# pragma NOWARN( 95 )
45# include <systype.h>
46#else
47# include <sys/types.h>
48#endif
49
50#ifdef HAVE_CONFIG_H
51#include "config.h"
52#endif
53
54/* The `emacs' switch turns on certain matching commands
55 that make sense only in Emacs. */
56#ifdef emacs
57
58#include "lisp.h"
59#include "buffer.h"
60#include "syntax.h"
61
62/* Emacs uses `NULL' as a predicate. */
63#undef NULL
64
65#else /* not emacs */
66
67#ifdef __TANDEM
68# include <string.h>
69# include <stdlib.h>
70# ifndef bcmp
71# define bcmp(s1, s2, n) memcmp ((s1), (s2), (n))
72# endif
73# ifndef bcopy
74# define bcopy(s, d, n) memcpy ((d), (s), (n))
75# endif
76# ifndef bzero
77# define bzero(s, n) memset ((s), 0, (n))
78# endif
79# pragma NOWARN( 96 )
80
81#else /* NOT TANDEM */
82
83/* We used to test for `BSTRING' here, but only GCC and Emacs define
84 `BSTRING', as far as I know, and neither of them use this code. */
85# if HAVE_STRING_H || STDC_HEADERS
86# include <string.h>
87# ifndef bcmp
88# define bcmp(s1, s2, n) memcmp ((s1), (s2), (n))
89# endif
90# ifndef bcopy
91# define bcopy(s, d, n) memcpy ((d), (s), (n))
92# endif
93# ifndef bzero
94# define bzero(s, n) memset ((s), 0, (n))
95# endif
96# else
97# include <strings.h>
98# endif
99
100# ifdef STDC_HEADERS
101# include <stdlib.h>
102# else
103 char *malloc ();
104 char *realloc ();
105# endif
106#endif /* TANDEM */
107
108/* Define the syntax stuff for \<, \>, etc. */
109
110/* This must be nonzero for the wordchar and notwordchar pattern
111 commands in re_match_2. */
112#ifndef Sword
113#define Sword 1
114#endif
115
116#ifdef SYNTAX_TABLE
117
118extern char *re_syntax_table;
119
120#else /* not SYNTAX_TABLE */
121
122/* How many characters in the character set. */
123#define CHAR_SET_SIZE 256
124
125static char re_syntax_table[CHAR_SET_SIZE];
126
127static void
128init_syntax_once ()
129{
130 register int c;
131 static int done = 0;
132
133 if (done)
134 return;
135
136 bzero (re_syntax_table, sizeof re_syntax_table);
137
138 for (c = 'a'; c <= 'z'; c++)
139 re_syntax_table[c] = Sword;
140
141 for (c = 'A'; c <= 'Z'; c++)
142 re_syntax_table[c] = Sword;
143
144 for (c = '0'; c <= '9'; c++)
145 re_syntax_table[c] = Sword;
146
147 re_syntax_table['_'] = Sword;
148
149 done = 1;
150}
151
152#endif /* not SYNTAX_TABLE */
153
154#define SYNTAX(c) re_syntax_table[c]
155
156#endif /* not emacs */
157\f
158/* Get the interface, including the syntax bits. */
159#include "regex.h"
160
161/* isalpha etc. are used for the character classes. */
162#include <ctype.h>
163
164#ifndef isascii
165#define isascii(c) 1
166#endif
167
168#ifdef isblank
169#define ISBLANK(c) (isascii (c) && isblank (c))
170#else
171#define ISBLANK(c) ((c) == ' ' || (c) == '\t')
172#endif
173#ifdef isgraph
174#define ISGRAPH(c) (isascii (c) && isgraph (c))
175#else
176#define ISGRAPH(c) (isascii (c) && isprint (c) && !isspace (c))
177#endif
178
179#define ISPRINT(c) (isascii (c) && isprint (c))
180#define ISDIGIT(c) (isascii (c) && isdigit (c))
181#define ISALNUM(c) (isascii (c) && isalnum (c))
182#define ISALPHA(c) (isascii (c) && isalpha (c))
183#define ISCNTRL(c) (isascii (c) && iscntrl (c))
184#define ISLOWER(c) (isascii (c) && islower (c))
185#define ISPUNCT(c) (isascii (c) && ispunct (c))
186#define ISSPACE(c) (isascii (c) && isspace (c))
187#define ISUPPER(c) (isascii (c) && isupper (c))
188#define ISXDIGIT(c) (isascii (c) && isxdigit (c))
189
190#ifndef NULL
191#define NULL 0
192#endif
193
194/* We remove any previous definition of `SIGN_EXTEND_CHAR',
195 since ours (we hope) works properly with all combinations of
196 machines, compilers, `char' and `unsigned char' argument types.
197 (Per Bothner suggested the basic approach.) */
198#undef SIGN_EXTEND_CHAR
199#if __STDC__
200#define SIGN_EXTEND_CHAR(c) ((signed char) (c))
201#else /* not __STDC__ */
202/* As in Harbison and Steele. */
203#define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
204#endif
205\f
206/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
207 use `alloca' instead of `malloc'. This is because using malloc in
208 re_search* or re_match* could cause memory leaks when C-g is used in
209 Emacs; also, malloc is slower and causes storage fragmentation. On
210 the other hand, malloc is more portable, and easier to debug.
211
212 Because we sometimes use alloca, some routines have to be macros,
213 not functions -- `alloca'-allocated space disappears at the end of the
214 function it is called in. */
215
216#ifdef __TANDEM
217#define REGEX_MALLOC 1
218#endif
219
220#ifdef REGEX_MALLOC
221
222#define REGEX_ALLOCATE malloc
223#define DESTINATION_DECL
224#define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
225
226#else /* not REGEX_MALLOC */
227
228/* Emacs already defines alloca, sometimes. */
229#ifndef alloca
230
231/* Make alloca work the best possible way. */
232#ifdef __GNUC__
233#define alloca __builtin_alloca
234#else /* not __GNUC__ */
235#if HAVE_ALLOCA_H
236#include <alloca.h>
237#else /* not __GNUC__ or HAVE_ALLOCA_H */
238#ifndef _AIX /* Already did AIX, up at the top. */
239char *alloca ();
240#endif /* not _AIX */
241#endif /* not HAVE_ALLOCA_H */
242#endif /* not __GNUC__ */
243
244#endif /* not alloca */
245
246#define REGEX_ALLOCATE alloca
247#define DESTINATION_DECL char* destination;
248
249/* Assumes a `char *destination' variable. */
250#define REGEX_REALLOCATE(source, osize, nsize) \
251 (destination = (char *) alloca (nsize), \
252 bcopy (source, destination, osize), \
253 destination)
254
255#endif /* not REGEX_MALLOC */
256
257
258/* True if `size1' is non-NULL and PTR is pointing anywhere inside
259 `string1' or just past its end. This works if PTR is NULL, which is
260 a good thing. */
261#define FIRST_STRING_P(ptr) \
262 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
263
264/* (Re)Allocate N items of type T using malloc, or fail. */
265#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
266#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
267#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
268
269#define BYTEWIDTH 8 /* In bits. */
270
271#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
272
273#define MAX(a, b) ((a) > (b) ? (a) : (b))
274#define MIN(a, b) ((a) < (b) ? (a) : (b))
275
276typedef char boolean;
277#define false 0
278#define true 1
279\f
280/* These are the command codes that appear in compiled regular
281 expressions. Some opcodes are followed by argument bytes. A
282 command code can specify any interpretation whatsoever for its
283 arguments. Zero bytes may appear in the compiled regular expression.
284
285 The value of `exactn' is needed in search.c (search_buffer) in Emacs.
286 So regex.h defines a symbol `RE_EXACTN_VALUE' to be 1; the value of
287 `exactn' we use here must also be 1. */
288
289typedef enum
290{
291 no_op = 0,
292
293 /* Followed by one byte giving n, then by n literal bytes. */
294 exactn = 1,
295
296 /* Matches any (more or less) character. */
297 anychar,
298
299 /* Matches any one char belonging to specified set. First
300 following byte is number of bitmap bytes. Then come bytes
301 for a bitmap saying which chars are in. Bits in each byte
302 are ordered low-bit-first. A character is in the set if its
303 bit is 1. A character too large to have a bit in the map is
304 automatically not in the set. */
305 charset,
306
307 /* Same parameters as charset, but match any character that is
308 not one of those specified. */
309 charset_not,
310
311 /* Start remembering the text that is matched, for storing in a
312 register. Followed by one byte with the register number, in
313 the range 0 to one less than the pattern buffer's re_nsub
314 field. Then followed by one byte with the number of groups
315 inner to this one. (This last has to be part of the
316 start_memory only because we need it in the on_failure_jump
317 of re_match_2.) */
318 start_memory,
319
320 /* Stop remembering the text that is matched and store it in a
321 memory register. Followed by one byte with the register
322 number, in the range 0 to one less than `re_nsub' in the
323 pattern buffer, and one byte with the number of inner groups,
324 just like `start_memory'. (We need the number of inner
325 groups here because we don't have any easy way of finding the
326 corresponding start_memory when we're at a stop_memory.) */
327 stop_memory,
328
329 /* Match a duplicate of something remembered. Followed by one
330 byte containing the register number. */
331 duplicate,
332
333 /* Fail unless at beginning of line. */
334 begline,
335
336 /* Fail unless at end of line. */
337 endline,
338
339 /* Succeeds if at beginning of buffer (if emacs) or at beginning
340 of string to be matched (if not). */
341 begbuf,
342
343 /* Analogously, for end of buffer/string. */
344 endbuf,
345
346 /* Followed by two byte relative address to which to jump. */
347 jump,
348
349 /* Same as jump, but marks the end of an alternative. */
350 jump_past_alt,
351
352 /* Followed by two-byte relative address of place to resume at
353 in case of failure. */
354 on_failure_jump,
355
356 /* Like on_failure_jump, but pushes a placeholder instead of the
357 current string position when executed. */
358 on_failure_keep_string_jump,
359
360 /* Throw away latest failure point and then jump to following
361 two-byte relative address. */
362 pop_failure_jump,
363
364 /* Change to pop_failure_jump if know won't have to backtrack to
365 match; otherwise change to jump. This is used to jump
366 back to the beginning of a repeat. If what follows this jump
367 clearly won't match what the repeat does, such that we can be
368 sure that there is no use backtracking out of repetitions
369 already matched, then we change it to a pop_failure_jump.
370 Followed by two-byte address. */
371 maybe_pop_jump,
372
373 /* Jump to following two-byte address, and push a dummy failure
374 point. This failure point will be thrown away if an attempt
375 is made to use it for a failure. A `+' construct makes this
376 before the first repeat. Also used as an intermediary kind
377 of jump when compiling an alternative. */
378 dummy_failure_jump,
379
380 /* Push a dummy failure point and continue. Used at the end of
381 alternatives. */
382 push_dummy_failure,
383
384 /* Followed by two-byte relative address and two-byte number n.
385 After matching N times, jump to the address upon failure. */
386 succeed_n,
387
388 /* Followed by two-byte relative address, and two-byte number n.
389 Jump to the address N times, then fail. */
390 jump_n,
391
392 /* Set the following two-byte relative address to the
393 subsequent two-byte number. The address *includes* the two
394 bytes of number. */
395 set_number_at,
396
397 wordchar, /* Matches any word-constituent character. */
398 notwordchar, /* Matches any char that is not a word-constituent. */
399
400 wordbeg, /* Succeeds if at word beginning. */
401 wordend, /* Succeeds if at word end. */
402
403 wordbound, /* Succeeds if at a word boundary. */
404 notwordbound /* Succeeds if not at a word boundary. */
405
406#ifdef emacs
407 ,before_dot, /* Succeeds if before point. */
408 at_dot, /* Succeeds if at point. */
409 after_dot, /* Succeeds if after point. */
410
411 /* Matches any character whose syntax is specified. Followed by
412 a byte which contains a syntax code, e.g., Sword. */
413 syntaxspec,
414
415 /* Matches any character whose syntax is not that specified. */
416 notsyntaxspec
417#endif /* emacs */
418} re_opcode_t;
419\f
420/* Common operations on the compiled pattern. */
421
422/* Store NUMBER in two contiguous bytes starting at DESTINATION. */
423
424#define STORE_NUMBER(destination, number) \
425 do { \
426 (destination)[0] = (unsigned char)((number) & 0377); \
427 (destination)[1] = (unsigned char)((number) >> 8); \
428 } while (0)
429
430/* Same as STORE_NUMBER, except increment DESTINATION to
431 the byte after where the number is stored. Therefore, DESTINATION
432 must be an lvalue. */
433
434#define STORE_NUMBER_AND_INCR(destination, number) \
435 do { \
436 STORE_NUMBER (destination, number); \
437 (destination) += 2; \
438 } while (0)
439
440/* Put into DESTINATION a number stored in two contiguous bytes starting
441 at SOURCE. */
442
443#define EXTRACT_NUMBER(destination, source) \
444 do { \
445 (destination) = *(source) & 0377; \
446 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
447 } while (0)
448
449#ifdef DEBUG
450static void
451extract_number (dest, source)
452 int *dest;
453 unsigned char *source;
454{
455 int temp = SIGN_EXTEND_CHAR (*(source + 1));
456 *dest = *source & 0377;
457 *dest += temp << 8;
458}
459
460#ifndef EXTRACT_MACROS /* To debug the macros. */
461#undef EXTRACT_NUMBER
462#define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
463#endif /* not EXTRACT_MACROS */
464
465#endif /* DEBUG */
466
467/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
468 SOURCE must be an lvalue. */
469
470#define EXTRACT_NUMBER_AND_INCR(destination, source) \
471 do { \
472 EXTRACT_NUMBER (destination, source); \
473 (source) += 2; \
474 } while (0)
475
476#ifdef DEBUG
477static void
478extract_number_and_incr (destination, source)
479 int *destination;
480 unsigned char **source;
481{
482 extract_number (destination, *source);
483 *source += 2;
484}
485
486#ifndef EXTRACT_MACROS
487#undef EXTRACT_NUMBER_AND_INCR
488#define EXTRACT_NUMBER_AND_INCR(dest, src) \
489 extract_number_and_incr (&dest, &src)
490#endif /* not EXTRACT_MACROS */
491
492#endif /* DEBUG */
493\f
494/* If DEBUG is defined, Regex prints many voluminous messages about what
495 it is doing (if the variable `debug' is nonzero). If linked with the
496 main program in `iregex.c', you can enter patterns and strings
497 interactively. And if linked with the main program in `main.c' and
498 the other test files, you can run the already-written tests. */
499
500#ifdef DEBUG
501
502/* We use standard I/O for debugging. */
503#include <stdio.h>
504
505/* It is useful to test things that ``must'' be true when debugging. */
506#include <assert.h>
507
508static int debug = 0;
509
510#define DEBUG_STATEMENT(e) e
511#define DEBUG_PRINT1(x) if (debug) printf (x)
512#define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
513#define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
514#define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
515#define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
516 if (debug) print_partial_compiled_pattern (s, e)
517#define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
518 if (debug) print_double_string (w, s1, sz1, s2, sz2)
519
520
521extern void printchar ();
522
523/* Print the fastmap in human-readable form. */
524
525void
526print_fastmap (fastmap)
527 char *fastmap;
528{
529 unsigned was_a_range = 0;
530 unsigned i = 0;
531
532 while (i < (1 << BYTEWIDTH))
533 {
534 if (fastmap[i++])
535 {
536 was_a_range = 0;
537 printchar (i - 1);
538 while (i < (1 << BYTEWIDTH) && fastmap[i])
539 {
540 was_a_range = 1;
541 i++;
542 }
543 if (was_a_range)
544 {
545 printf ("-");
546 printchar (i - 1);
547 }
548 }
549 }
550 putchar ('\n');
551}
552
553
554/* Print a compiled pattern string in human-readable form, starting at
555 the START pointer into it and ending just before the pointer END. */
556
557void
558print_partial_compiled_pattern (start, end)
559 unsigned char *start;
560 unsigned char *end;
561{
562 int mcnt, mcnt2;
563 unsigned char *p = start;
564 unsigned char *pend = end;
565
566 if (start == NULL)
567 {
568 printf ("(null)\n");
569 return;
570 }
571
572 /* Loop over pattern commands. */
573 while (p < pend)
574 {
575 switch ((re_opcode_t) *p++)
576 {
577 case no_op:
578 printf ("/no_op");
579 break;
580
581 case exactn:
582 mcnt = *p++;
583 printf ("/exactn/%d", mcnt);
584 do
585 {
586 putchar ('/');
587 printchar (*p++);
588 }
589 while (--mcnt);
590 break;
591
592 case start_memory:
593 mcnt = *p++;
594 printf ("/start_memory/%d/%d", mcnt, *p++);
595 break;
596
597 case stop_memory:
598 mcnt = *p++;
599 printf ("/stop_memory/%d/%d", mcnt, *p++);
600 break;
601
602 case duplicate:
603 printf ("/duplicate/%d", *p++);
604 break;
605
606 case anychar:
607 printf ("/anychar");
608 break;
609
610 case charset:
611 case charset_not:
612 {
613 register int c;
614
615 printf ("/charset%s",
616 (re_opcode_t) *(p - 1) == charset_not ? "_not" : "");
617
618 assert (p + *p < pend);
619
620 for (c = 0; c < *p; c++)
621 {
622 unsigned bit;
623 unsigned char map_byte = p[1 + c];
624
625 putchar ('/');
626
627 for (bit = 0; bit < BYTEWIDTH; bit++)
628 if (map_byte & (1 << bit))
629 printchar (c * BYTEWIDTH + bit);
630 }
631 p += 1 + *p;
632 break;
633 }
634
635 case begline:
636 printf ("/begline");
637 break;
638
639 case endline:
640 printf ("/endline");
641 break;
642
643 case on_failure_jump:
644 extract_number_and_incr (&mcnt, &p);
645 printf ("/on_failure_jump/0/%d", mcnt);
646 break;
647
648 case on_failure_keep_string_jump:
649 extract_number_and_incr (&mcnt, &p);
650 printf ("/on_failure_keep_string_jump/0/%d", mcnt);
651 break;
652
653 case dummy_failure_jump:
654 extract_number_and_incr (&mcnt, &p);
655 printf ("/dummy_failure_jump/0/%d", mcnt);
656 break;
657
658 case push_dummy_failure:
659 printf ("/push_dummy_failure");
660 break;
661
662 case maybe_pop_jump:
663 extract_number_and_incr (&mcnt, &p);
664 printf ("/maybe_pop_jump/0/%d", mcnt);
665 break;
666
667 case pop_failure_jump:
668 extract_number_and_incr (&mcnt, &p);
669 printf ("/pop_failure_jump/0/%d", mcnt);
670 break;
671
672 case jump_past_alt:
673 extract_number_and_incr (&mcnt, &p);
674 printf ("/jump_past_alt/0/%d", mcnt);
675 break;
676
677 case jump:
678 extract_number_and_incr (&mcnt, &p);
679 printf ("/jump/0/%d", mcnt);
680 break;
681
682 case succeed_n:
683 extract_number_and_incr (&mcnt, &p);
684 extract_number_and_incr (&mcnt2, &p);
685 printf ("/succeed_n/0/%d/0/%d", mcnt, mcnt2);
686 break;
687
688 case jump_n:
689 extract_number_and_incr (&mcnt, &p);
690 extract_number_and_incr (&mcnt2, &p);
691 printf ("/jump_n/0/%d/0/%d", mcnt, mcnt2);
692 break;
693
694 case set_number_at:
695 extract_number_and_incr (&mcnt, &p);
696 extract_number_and_incr (&mcnt2, &p);
697 printf ("/set_number_at/0/%d/0/%d", mcnt, mcnt2);
698 break;
699
700 case wordbound:
701 printf ("/wordbound");
702 break;
703
704 case notwordbound:
705 printf ("/notwordbound");
706 break;
707
708 case wordbeg:
709 printf ("/wordbeg");
710 break;
711
712 case wordend:
713 printf ("/wordend");
714
715#ifdef emacs
716 case before_dot:
717 printf ("/before_dot");
718 break;
719
720 case at_dot:
721 printf ("/at_dot");
722 break;
723
724 case after_dot:
725 printf ("/after_dot");
726 break;
727
728 case syntaxspec:
729 printf ("/syntaxspec");
730 mcnt = *p++;
731 printf ("/%d", mcnt);
732 break;
733
734 case notsyntaxspec:
735 printf ("/notsyntaxspec");
736 mcnt = *p++;
737 printf ("/%d", mcnt);
738 break;
739#endif /* emacs */
740
741 case wordchar:
742 printf ("/wordchar");
743 break;
744
745 case notwordchar:
746 printf ("/notwordchar");
747 break;
748
749 case begbuf:
750 printf ("/begbuf");
751 break;
752
753 case endbuf:
754 printf ("/endbuf");
755 break;
756
757 default:
758 printf ("?%d", *(p-1));
759 }
760 }
761 printf ("/\n");
762}
763
764
765void
766print_compiled_pattern (bufp)
767 struct re_pattern_buffer *bufp;
768{
769 unsigned char *buffer = bufp->buffer;
770
771 print_partial_compiled_pattern (buffer, buffer + bufp->used);
772 printf ("%d bytes used/%d bytes allocated.\n", bufp->used, bufp->allocated);
773
774 if (bufp->fastmap_accurate && bufp->fastmap)
775 {
776 printf ("fastmap: ");
777 print_fastmap (bufp->fastmap);
778 }
779
780 printf ("re_nsub: %d\t", bufp->re_nsub);
781 printf ("regs_alloc: %d\t", bufp->regs_allocated);
782 printf ("can_be_null: %d\t", bufp->can_be_null);
783 printf ("newline_anchor: %d\n", bufp->newline_anchor);
784 printf ("no_sub: %d\t", bufp->no_sub);
785 printf ("not_bol: %d\t", bufp->not_bol);
786 printf ("not_eol: %d\t", bufp->not_eol);
787 printf ("syntax: %d\n", bufp->syntax);
788 /* Perhaps we should print the translate table? */
789}
790
791
792void
793print_double_string (where, string1, size1, string2, size2)
794 const char *where;
795 const char *string1;
796 const char *string2;
797 int size1;
798 int size2;
799{
800 unsigned this_char;
801
802 if (where == NULL)
803 printf ("(null)");
804 else
805 {
806 if (FIRST_STRING_P (where))
807 {
808 for (this_char = where - string1; this_char < size1; this_char++)
809 printchar (string1[this_char]);
810
811 where = string2;
812 }
813
814 for (this_char = where - string2; this_char < size2; this_char++)
815 printchar (string2[this_char]);
816 }
817}
818
819#else /* not DEBUG */
820
821#undef assert
822#define assert(e)
823
824#define DEBUG_STATEMENT(e)
825#define DEBUG_PRINT1(x)
826#define DEBUG_PRINT2(x1, x2)
827#define DEBUG_PRINT3(x1, x2, x3)
828#define DEBUG_PRINT4(x1, x2, x3, x4)
829#define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
830#define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
831
832#endif /* not DEBUG */
833\f
834/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
835 also be assigned to arbitrarily: each pattern buffer stores its own
836 syntax, so it can be changed between regex compilations. */
837reg_syntax_t re_syntax_options = RE_SYNTAX_EMACS;
838
839
840/* Specify the precise syntax of regexps for compilation. This provides
841 for compatibility for various utilities which historically have
842 different, incompatible syntaxes.
843
844 The argument SYNTAX is a bit mask comprised of the various bits
845 defined in regex.h. We return the old syntax. */
846
847reg_syntax_t
848re_set_syntax (syntax)
849 reg_syntax_t syntax;
850{
851 reg_syntax_t ret = re_syntax_options;
852
853 re_syntax_options = syntax;
854 return ret;
855}
856\f
857/* This table gives an error message for each of the error codes listed
858 in regex.h. Obviously the order here has to be same as there. */
859#define _RERR_(n,t) t,
860static const char *re_error_msg[] = { REG_ERR_TABLE };
861# undef _RERR_
862\f
863/* Subroutine declarations and macros for regex_compile. */
864
865/* Fetch the next character in the uncompiled pattern---translating it
866 if necessary. Also cast from a signed character in the constant
867 string passed to us by the user to an unsigned char that we can use
868 as an array index (in, e.g., `translate'). */
869#define PATFETCH(c) \
870 do {if (p == pend) return REG_EEND; \
871 c = (unsigned char) *p++; \
872 if (translate) c = translate[c]; \
873 } while (0)
874
875/* Fetch the next character in the uncompiled pattern, with no
876 translation. */
877#define PATFETCH_RAW(c) \
878 do {if (p == pend) return REG_EEND; \
879 c = (unsigned char) *p++; \
880 } while (0)
881
882/* Go backwards one character in the pattern. */
883#define PATUNFETCH p--
884
885
886/* If `translate' is non-null, return translate[D], else just D. We
887 cast the subscript to translate because some data is declared as
888 `char *', to avoid warnings when a string constant is passed. But
889 when we use a character as a subscript we must make it unsigned. */
890#define TRANSLATE(d) (translate ? translate[(unsigned char) (d)] : (d))
891
892
893/* Macros for outputting the compiled pattern into `buffer'. */
894
895/* If the buffer isn't allocated when it comes in, use this. */
896#define INIT_BUF_SIZE 32
897
898/* Make sure we have at least N more bytes of space in buffer. */
899#define GET_BUFFER_SPACE(n) \
900 while (b - bufp->buffer + (n) > bufp->allocated) \
901 EXTEND_BUFFER ()
902
903/* Make sure we have one more byte of buffer space and then add C to it. */
904#define BUF_PUSH(c) \
905 do { \
906 GET_BUFFER_SPACE (1); \
907 *b++ = (unsigned char) (c); \
908 } while (0)
909
910
911/* Ensure we have two more bytes of buffer space and then append C1 and C2. */
912#define BUF_PUSH_2(c1, c2) \
913 do { \
914 GET_BUFFER_SPACE (2); \
915 *b++ = (unsigned char) (c1); \
916 *b++ = (unsigned char) (c2); \
917 } while (0)
918
919
920/* As with BUF_PUSH_2, except for three bytes. */
921#define BUF_PUSH_3(c1, c2, c3) \
922 do { \
923 GET_BUFFER_SPACE (3); \
924 *b++ = (unsigned char) (c1); \
925 *b++ = (unsigned char) (c2); \
926 *b++ = (unsigned char) (c3); \
927 } while (0)
928
929
930/* Store a jump with opcode OP at LOC to location TO. We store a
931 relative address offset by the three bytes the jump itself occupies. */
932#define STORE_JUMP(op, loc, to) \
933 store_op1 (op, loc, (to) - (loc) - 3)
934
935/* Likewise, for a two-argument jump. */
936#define STORE_JUMP2(op, loc, to, arg) \
937 store_op2 (op, loc, (to) - (loc) - 3, arg)
938
939/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
940#define INSERT_JUMP(op, loc, to) \
941 insert_op1 (op, loc, (to) - (loc) - 3, b)
942
943/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
944#define INSERT_JUMP2(op, loc, to, arg) \
945 insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
946
947
948/* This is not an arbitrary limit: the arguments which represent offsets
949 into the pattern are two bytes long. So if 2^16 bytes turns out to
950 be too small, many things would have to change. */
951#define MAX_BUF_SIZE (1L << 16)
952
953
954/* Extend the buffer by twice its current size via realloc and
955 reset the pointers that pointed into the old block to point to the
956 correct places in the new one. If extending the buffer results in it
957 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
958#define EXTEND_BUFFER() \
959 do { \
960 unsigned char *old_buffer = bufp->buffer; \
961 if (bufp->allocated == MAX_BUF_SIZE) \
962 return REG_ESIZE; \
963 bufp->allocated <<= 1; \
964 if (bufp->allocated > MAX_BUF_SIZE) \
965 bufp->allocated = MAX_BUF_SIZE; \
966 bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\
967 if (bufp->buffer == NULL) \
968 return REG_ESPACE; \
969 /* If the buffer moved, move all the pointers into it. */ \
970 if (old_buffer != bufp->buffer) \
971 { \
972 b = (b - old_buffer) + bufp->buffer; \
973 begalt = (begalt - old_buffer) + bufp->buffer; \
974 if (fixup_alt_jump) \
975 fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\
976 if (laststart) \
977 laststart = (laststart - old_buffer) + bufp->buffer; \
978 if (pending_exact) \
979 pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
980 } \
981 } while (0)
982
983
984/* Since we have one byte reserved for the register number argument to
985 {start,stop}_memory, the maximum number of groups we can report
986 things about is what fits in that byte. */
987#define MAX_REGNUM 255
988
989/* But patterns can have more than `MAX_REGNUM' registers. We just
990 ignore the excess. */
991typedef unsigned regnum_t;
992
993
994/* Macros for the compile stack. */
995
996/* Since offsets can go either forwards or backwards, this type needs to
997 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
998typedef int pattern_offset_t;
999
1000typedef struct
1001{
1002 pattern_offset_t begalt_offset;
1003 pattern_offset_t fixup_alt_jump;
1004 pattern_offset_t inner_group_offset;
1005 pattern_offset_t laststart_offset;
1006 regnum_t regnum;
1007} compile_stack_elt_t;
1008
1009
1010typedef struct
1011{
1012 compile_stack_elt_t *stack;
1013 unsigned size;
1014 unsigned avail; /* Offset of next open position. */
1015} compile_stack_type;
1016
1017
1018#define INIT_COMPILE_STACK_SIZE 32
1019
1020#define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1021#define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1022
1023/* The next available element. */
1024#define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1025
1026#ifdef __TANDEM
1027#define TYPED_SET_LIST_BIT(c) \
1028 (b[(c) / BYTEWIDTH] |= (unsigned char) (1 << ((c) % BYTEWIDTH)))
1029
1030/* Set the bit for character C in a list. */
1031#define SET_LIST_BIT(c) TYPED_SET_LIST_BIT((unsigned char)c)
1032#else
1033/* Set the bit for character C in a list. */
1034#define SET_LIST_BIT(c) \
1035 (b[((unsigned char) (c)) / BYTEWIDTH] \
1036 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1037#endif
1038
1039/* Get the next unsigned number in the uncompiled pattern. */
1040#define GET_UNSIGNED_NUMBER(num) \
1041 { if (p != pend) \
1042 { \
1043 PATFETCH (c); \
1044 while (ISDIGIT (c)) \
1045 { \
1046 if (num < 0) \
1047 num = 0; \
1048 num = num * 10 + c - '0'; \
1049 if (p == pend) \
1050 break; \
1051 PATFETCH (c); \
1052 } \
1053 } \
1054 }
1055
1056#define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
1057
1058#define IS_CHAR_CLASS(string) \
1059 (STREQ (string, "alpha") || STREQ (string, "upper") \
1060 || STREQ (string, "lower") || STREQ (string, "digit") \
1061 || STREQ (string, "alnum") || STREQ (string, "xdigit") \
1062 || STREQ (string, "space") || STREQ (string, "print") \
1063 || STREQ (string, "punct") || STREQ (string, "graph") \
1064 || STREQ (string, "cntrl") || STREQ (string, "blank"))
1065\f
1066static void store_op1 (), store_op2 ();
1067static void insert_op1 (), insert_op2 ();
1068static boolean at_begline_loc_p (), at_endline_loc_p ();
1069#ifndef __TANDEM
1070static boolean group_in_compile_stack ();
1071#else
1072static boolean group_in_compile_stack (compile_stack_type*, regnum_t);
1073#endif
1074static reg_errcode_t compile_range ();
1075
1076\f
1077/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
1078 Returns one of error codes defined in `regex.h', or zero for success.
1079
1080 Assumes the `allocated' (and perhaps `buffer') and `translate'
1081 fields are set in BUFP on entry.
1082
1083 If it succeeds, results are put in BUFP (if it returns an error, the
1084 contents of BUFP are undefined):
1085 `buffer' is the compiled pattern;
1086 `syntax' is set to SYNTAX;
1087 `used' is set to the length of the compiled pattern;
1088 `fastmap_accurate' is zero;
1089 `re_nsub' is the number of subexpressions in PATTERN;
1090 `not_bol' and `not_eol' are zero;
1091
1092 The `fastmap' and `newline_anchor' fields are neither
1093 examined nor set. */
1094
1095static reg_errcode_t
1096regex_compile (pattern, size, syntax, bufp)
1097 const char *pattern;
1098 int size;
1099 reg_syntax_t syntax;
1100 struct re_pattern_buffer *bufp;
1101{
1102 /* We fetch characters from PATTERN here. Even though PATTERN is
1103 `char *' (i.e., signed), we declare these variables as unsigned, so
1104 they can be reliably used as array indices. */
1105 register unsigned char c, c1;
1106
1107 /* A random tempory spot in PATTERN. */
1108 const char *p1;
1109
1110 /* Points to the end of the buffer, where we should append. */
1111 register unsigned char *b;
1112
1113 /* Keeps track of unclosed groups. */
1114 compile_stack_type compile_stack;
1115
1116 /* Points to the current (ending) position in the pattern. */
1117 const char *p = pattern;
1118 const char *pend = pattern + size;
1119
1120 /* How to translate the characters in the pattern. */
1121 char *translate = bufp->translate;
1122
1123 /* Address of the count-byte of the most recently inserted `exactn'
1124 command. This makes it possible to tell if a new exact-match
1125 character can be added to that command or if the character requires
1126 a new `exactn' command. */
1127 unsigned char *pending_exact = 0;
1128
1129 /* Address of start of the most recently finished expression.
1130 This tells, e.g., postfix * where to find the start of its
1131 operand. Reset at the beginning of groups and alternatives. */
1132 unsigned char *laststart = 0;
1133
1134 /* Address of beginning of regexp, or inside of last group. */
1135 unsigned char *begalt;
1136
1137 /* Place in the uncompiled pattern (i.e., the {) to
1138 which to go back if the interval is invalid. */
1139 const char *beg_interval;
1140
1141 /* Address of the place where a forward jump should go to the end of
1142 the containing expression. Each alternative of an `or' -- except the
1143 last -- ends with a forward jump of this sort. */
1144 unsigned char *fixup_alt_jump = 0;
1145
1146 /* Counts open-groups as they are encountered. Remembered for the
1147 matching close-group on the compile stack, so the same register
1148 number is put in the stop_memory as the start_memory. */
1149 regnum_t regnum = 0;
1150
1151#ifdef DEBUG
1152 DEBUG_PRINT1 ("\nCompiling pattern: ");
1153 if (debug)
1154 {
1155 unsigned debug_count;
1156
1157 for (debug_count = 0; debug_count < size; debug_count++)
1158 printchar (pattern[debug_count]);
1159 putchar ('\n');
1160 }
1161#endif /* DEBUG */
1162
1163 /* Initialize the compile stack. */
1164 compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
1165 if (compile_stack.stack == NULL)
1166 return REG_ESPACE;
1167
1168 compile_stack.size = INIT_COMPILE_STACK_SIZE;
1169 compile_stack.avail = 0;
1170
1171 /* Initialize the pattern buffer. */
1172 bufp->syntax = syntax;
1173 bufp->fastmap_accurate = 0;
1174 bufp->not_bol = bufp->not_eol = 0;
1175
1176 /* Set `used' to zero, so that if we return an error, the pattern
1177 printer (for debugging) will think there's no pattern. We reset it
1178 at the end. */
1179 bufp->used = 0;
1180
1181 /* Always count groups, whether or not bufp->no_sub is set. */
1182 bufp->re_nsub = 0;
1183
1184#if !defined (emacs) && !defined (SYNTAX_TABLE)
1185 /* Initialize the syntax table. */
1186#ifdef __TANDEM
1187 /*
1188 * How can any compiler be so amazingly brain dead?
1189 */
1190#pragma NOWARN( 95 )
1191#endif
1192 init_syntax_once ();
1193#endif
1194
1195 if (bufp->allocated == 0)
1196 {
1197 if (bufp->buffer)
1198 { /* If zero allocated, but buffer is non-null, try to realloc
1199 enough space. This loses if buffer's address is bogus, but
1200 that is the user's responsibility. */
1201 RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
1202 }
1203 else
1204 { /* Caller did not allocate a buffer. Do it for them. */
1205 bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
1206 }
1207 if (!bufp->buffer) return REG_ESPACE;
1208
1209 bufp->allocated = INIT_BUF_SIZE;
1210 }
1211
1212 begalt = b = bufp->buffer;
1213
1214 /* Loop through the uncompiled pattern until we're at the end. */
1215 while (p != pend)
1216 {
1217 PATFETCH (c);
1218
1219 switch (c)
1220 {
1221 case '^':
1222 {
1223 if ( /* If at start of pattern, it's an operator. */
1224 p == pattern + 1
1225 /* If context independent, it's an operator. */
1226 || syntax & RE_CONTEXT_INDEP_ANCHORS
1227 /* Otherwise, depends on what's come before. */
1228 || at_begline_loc_p (pattern, p, syntax))
1229 BUF_PUSH (begline);
1230 else
1231 goto normal_char;
1232 }
1233 break;
1234
1235
1236 case '$':
1237 {
1238 if ( /* If at end of pattern, it's an operator. */
1239 p == pend
1240 /* If context independent, it's an operator. */
1241 || syntax & RE_CONTEXT_INDEP_ANCHORS
1242 /* Otherwise, depends on what's next. */
1243 || at_endline_loc_p (p, pend, syntax))
1244 BUF_PUSH (endline);
1245 else
1246 goto normal_char;
1247 }
1248 break;
1249
1250
1251 case '+':
1252 case '?':
1253 if ((syntax & RE_BK_PLUS_QM)
1254 || (syntax & RE_LIMITED_OPS))
1255 goto normal_char;
1256 handle_plus:
1257 case '*':
1258 /* If there is no previous pattern... */
1259 if (!laststart)
1260 {
1261 if (syntax & RE_CONTEXT_INVALID_OPS)
1262 return REG_BADRPT;
1263 else if (!(syntax & RE_CONTEXT_INDEP_OPS))
1264 goto normal_char;
1265 }
1266
1267 {
1268 /* Are we optimizing this jump? */
1269 boolean keep_string_p = false;
1270
1271 /* 1 means zero (many) matches is allowed. */
1272 char zero_times_ok = 0, many_times_ok = 0;
1273
1274 /* If there is a sequence of repetition chars, collapse it
1275 down to just one (the right one). We can't combine
1276 interval operators with these because of, e.g., `a{2}*',
1277 which should only match an even number of `a's. */
1278
1279 for (;;)
1280 {
1281 zero_times_ok |= c != '+';
1282 many_times_ok |= c != '?';
1283
1284 if (p == pend)
1285 break;
1286
1287 PATFETCH (c);
1288
1289 if (c == '*'
1290 || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
1291 ;
1292
1293 else if (syntax & RE_BK_PLUS_QM && c == '\\')
1294 {
1295 if (p == pend) return REG_EESCAPE;
1296
1297 PATFETCH (c1);
1298 if (!(c1 == '+' || c1 == '?'))
1299 {
1300 PATUNFETCH;
1301 PATUNFETCH;
1302 break;
1303 }
1304
1305 c = c1;
1306 }
1307 else
1308 {
1309 PATUNFETCH;
1310 break;
1311 }
1312
1313 /* If we get here, we found another repeat character. */
1314 }
1315
1316 /* Star, etc. applied to an empty pattern is equivalent
1317 to an empty pattern. */
1318 if (!laststart)
1319 break;
1320
1321 /* Now we know whether or not zero matches is allowed
1322 and also whether or not two or more matches is allowed. */
1323 if (many_times_ok)
1324 { /* More than one repetition is allowed, so put in at the
1325 end a backward relative jump from `b' to before the next
1326 jump we're going to put in below (which jumps from
1327 laststart to after this jump).
1328
1329 But if we are at the `*' in the exact sequence `.*\n',
1330 insert an unconditional jump backwards to the .,
1331 instead of the beginning of the loop. This way we only
1332 push a failure point once, instead of every time
1333 through the loop. */
1334 assert (p - 1 > pattern);
1335
1336 /* Allocate the space for the jump. */
1337 GET_BUFFER_SPACE (3);
1338
1339 /* We know we are not at the first character of the pattern,
1340 because laststart was nonzero. And we've already
1341 incremented `p', by the way, to be the character after
1342 the `*'. Do we have to do something analogous here
1343 for null bytes, because of RE_DOT_NOT_NULL? */
1344 if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
1345 && zero_times_ok
1346 && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
1347 && !(syntax & RE_DOT_NEWLINE))
1348 { /* We have .*\n. */
1349 STORE_JUMP (jump, b, laststart);
1350 keep_string_p = true;
1351 }
1352 else
1353 /* Anything else. */
1354 STORE_JUMP (maybe_pop_jump, b, laststart - 3);
1355
1356 /* We've added more stuff to the buffer. */
1357 b += 3;
1358 }
1359
1360 /* On failure, jump from laststart to b + 3, which will be the
1361 end of the buffer after this jump is inserted. */
1362 GET_BUFFER_SPACE (3);
1363 INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
1364 : on_failure_jump,
1365 laststart, b + 3);
1366 pending_exact = 0;
1367 b += 3;
1368
1369 if (!zero_times_ok)
1370 {
1371 /* At least one repetition is required, so insert a
1372 `dummy_failure_jump' before the initial
1373 `on_failure_jump' instruction of the loop. This
1374 effects a skip over that instruction the first time
1375 we hit that loop. */
1376 GET_BUFFER_SPACE (3);
1377 INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
1378 b += 3;
1379 }
1380 }
1381 break;
1382
1383
1384 case '.':
1385 laststart = b;
1386 BUF_PUSH (anychar);
1387 break;
1388
1389
1390 case '[':
1391 {
1392 boolean had_char_class = false;
1393
1394 if (p == pend) return REG_EBRACK;
1395
1396 /* Ensure that we have enough space to push a charset: the
1397 opcode, the length count, and the bitset; 34 bytes in all. */
1398 GET_BUFFER_SPACE (34);
1399
1400 laststart = b;
1401
1402 /* We test `*p == '^' twice, instead of using an if
1403 statement, so we only need one BUF_PUSH. */
1404 BUF_PUSH (*p == '^' ? charset_not : charset);
1405 if (*p == '^')
1406 p++;
1407
1408 /* Remember the first position in the bracket expression. */
1409 p1 = p;
1410
1411 /* Push the number of bytes in the bitmap. */
1412 BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
1413
1414 /* Clear the whole map. */
1415 bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
1416
1417 /* charset_not matches newline according to a syntax bit. */
1418 if ((re_opcode_t) b[-2] == charset_not
1419 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
1420 SET_LIST_BIT ('\n');
1421
1422 /* Read in characters and ranges, setting map bits. */
1423 for (;;)
1424 {
1425 if (p == pend) return REG_EBRACK;
1426
1427 PATFETCH (c);
1428
1429 /* \ might escape characters inside [...] and [^...]. */
1430 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
1431 {
1432 if (p == pend) return REG_EESCAPE;
1433
1434 PATFETCH (c1);
1435 SET_LIST_BIT (c1);
1436 continue;
1437 }
1438
1439 /* Could be the end of the bracket expression. If it's
1440 not (i.e., when the bracket expression is `[]' so
1441 far), the ']' character bit gets set way below. */
1442 if (c == ']' && p != p1 + 1)
1443 break;
1444
1445 /* Look ahead to see if it's a range when the last thing
1446 was a character class. */
1447 if (had_char_class && c == '-' && *p != ']')
1448 return REG_ERANGE;
1449
1450 /* Look ahead to see if it's a range when the last thing
1451 was a character: if this is a hyphen not at the
1452 beginning or the end of a list, then it's the range
1453 operator. */
1454 if (c == '-'
1455 && !(p - 2 >= pattern && p[-2] == '[')
1456 && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
1457 && *p != ']')
1458 {
1459 reg_errcode_t ret
1460 = compile_range (&p, pend, translate, syntax, b);
1461 if (ret != REG_NOERROR) return ret;
1462 }
1463
1464 else if (p[0] == '-' && p[1] != ']')
1465 { /* This handles ranges made up of characters only. */
1466 reg_errcode_t ret;
1467
1468 /* Move past the `-'. */
1469 PATFETCH (c1);
1470
1471 ret = compile_range (&p, pend, translate, syntax, b);
1472 if (ret != REG_NOERROR) return ret;
1473 }
1474
1475 /* See if we're at the beginning of a possible character
1476 class. */
1477
1478 else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
1479 { /* Leave room for the null. */
1480 char str[CHAR_CLASS_MAX_LENGTH + 1];
1481
1482 PATFETCH (c);
1483 c1 = 0;
1484
1485 /* If pattern is `[[:'. */
1486 if (p == pend) return REG_EBRACK;
1487
1488 for (;;)
1489 {
1490 PATFETCH (c);
1491 if (c == ':' || c == ']' || p == pend
1492 || c1 == CHAR_CLASS_MAX_LENGTH)
1493 break;
1494 str[c1++] = c;
1495 }
1496 str[c1] = '\0';
1497
1498 /* If isn't a word bracketed by `[:' and:`]':
1499 undo the ending character, the letters, and leave
1500 the leading `:' and `[' (but set bits for them). */
1501 if (c == ':' && *p == ']')
1502 {
1503 int ch;
1504 boolean is_alnum = STREQ (str, "alnum");
1505 boolean is_alpha = STREQ (str, "alpha");
1506 boolean is_blank = STREQ (str, "blank");
1507 boolean is_cntrl = STREQ (str, "cntrl");
1508 boolean is_digit = STREQ (str, "digit");
1509 boolean is_graph = STREQ (str, "graph");
1510 boolean is_lower = STREQ (str, "lower");
1511 boolean is_print = STREQ (str, "print");
1512 boolean is_punct = STREQ (str, "punct");
1513 boolean is_space = STREQ (str, "space");
1514 boolean is_upper = STREQ (str, "upper");
1515 boolean is_xdigit = STREQ (str, "xdigit");
1516
1517 if (!IS_CHAR_CLASS (str)) return REG_ECTYPE;
1518
1519 /* Throw away the ] at the end of the character
1520 class. */
1521 PATFETCH (c);
1522
1523 if (p == pend) return REG_EBRACK;
1524
1525 for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
1526 {
1527 if ( (is_alnum && ISALNUM (ch))
1528 || (is_alpha && ISALPHA (ch))
1529 || (is_blank && ISBLANK (ch))
1530 || (is_cntrl && ISCNTRL (ch))
1531 || (is_digit && ISDIGIT (ch))
1532 || (is_graph && ISGRAPH (ch))
1533 || (is_lower && ISLOWER (ch))
1534 || (is_print && ISPRINT (ch))
1535 || (is_punct && ISPUNCT (ch))
1536 || (is_space && ISSPACE (ch))
1537 || (is_upper && ISUPPER (ch))
1538 || (is_xdigit && ISXDIGIT (ch)))
1539 SET_LIST_BIT (ch);
1540 }
1541 had_char_class = true;
1542 }
1543 else
1544 {
1545 c1++;
1546 while (c1--)
1547 PATUNFETCH;
1548 SET_LIST_BIT ('[');
1549 SET_LIST_BIT (':');
1550 had_char_class = false;
1551 }
1552 }
1553 else
1554 {
1555 had_char_class = false;
1556 SET_LIST_BIT (c);
1557 }
1558 }
1559
1560 /* Discard any (non)matching list bytes that are all 0 at the
1561 end of the map. Decrease the map-length byte too. */
1562 while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
1563 b[-1]--;
1564 b += b[-1];
1565 }
1566 break;
1567
1568
1569 case '(':
1570 if (syntax & RE_NO_BK_PARENS)
1571 goto handle_open;
1572 else
1573 goto normal_char;
1574
1575
1576 case ')':
1577 if (syntax & RE_NO_BK_PARENS)
1578 goto handle_close;
1579 else
1580 goto normal_char;
1581
1582
1583 case '\n':
1584 if (syntax & RE_NEWLINE_ALT)
1585 goto handle_alt;
1586 else
1587 goto normal_char;
1588
1589
1590 case '|':
1591 if (syntax & RE_NO_BK_VBAR)
1592 goto handle_alt;
1593 else
1594 goto normal_char;
1595
1596
1597 case '{':
1598 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
1599 goto handle_interval;
1600 else
1601 goto normal_char;
1602
1603
1604 case '\\':
1605 if (p == pend) return REG_EESCAPE;
1606
1607 /* Do not translate the character after the \, so that we can
1608 distinguish, e.g., \B from \b, even if we normally would
1609 translate, e.g., B to b. */
1610 PATFETCH_RAW (c);
1611
1612 switch (c)
1613 {
1614 case '(':
1615 if (syntax & RE_NO_BK_PARENS)
1616 goto normal_backslash;
1617
1618 handle_open:
1619 bufp->re_nsub++;
1620 regnum++;
1621
1622 if (COMPILE_STACK_FULL)
1623 {
1624 RETALLOC (compile_stack.stack, compile_stack.size << 1,
1625 compile_stack_elt_t);
1626 if (compile_stack.stack == NULL) return REG_ESPACE;
1627
1628 compile_stack.size <<= 1;
1629 }
1630
1631 /* These are the values to restore when we hit end of this
1632 group. They are all relative offsets, so that if the
1633 whole pattern moves because of realloc, they will still
1634 be valid. */
1635 COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
1636 COMPILE_STACK_TOP.fixup_alt_jump
1637 = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
1638 COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
1639 COMPILE_STACK_TOP.regnum = regnum;
1640
1641 /* We will eventually replace the 0 with the number of
1642 groups inner to this one. But do not push a
1643 start_memory for groups beyond the last one we can
1644 represent in the compiled pattern. */
1645 if (regnum <= MAX_REGNUM)
1646 {
1647 COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2;
1648 BUF_PUSH_3 (start_memory, regnum, 0);
1649 }
1650
1651 compile_stack.avail++;
1652
1653 fixup_alt_jump = 0;
1654 laststart = 0;
1655 begalt = b;
1656 /* If we've reached MAX_REGNUM groups, then this open
1657 won't actually generate any code, so we'll have to
1658 clear pending_exact explicitly. */
1659 pending_exact = 0;
1660 break;
1661
1662
1663 case ')':
1664 if (syntax & RE_NO_BK_PARENS) goto normal_backslash;
1665
1666 if (COMPILE_STACK_EMPTY)
1667 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
1668 goto normal_backslash;
1669 else
1670 return REG_ERPAREN;
1671
1672 handle_close:
1673 if (fixup_alt_jump)
1674 { /* Push a dummy failure point at the end of the
1675 alternative for a possible future
1676 `pop_failure_jump' to pop. See comments at
1677 `push_dummy_failure' in `re_match_2'. */
1678 BUF_PUSH (push_dummy_failure);
1679
1680 /* We allocated space for this jump when we assigned
1681 to `fixup_alt_jump', in the `handle_alt' case below. */
1682 STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1);
1683 }
1684
1685 /* See similar code for backslashed left paren above. */
1686 if (COMPILE_STACK_EMPTY)
1687 if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)
1688 goto normal_char;
1689 else
1690 return REG_ERPAREN;
1691
1692 /* Since we just checked for an empty stack above, this
1693 ``can't happen''. */
1694 assert (compile_stack.avail != 0);
1695 {
1696 /* We don't just want to restore into `regnum', because
1697 later groups should continue to be numbered higher,
1698 as in `(ab)c(de)' -- the second group is #2. */
1699 regnum_t this_group_regnum;
1700
1701 compile_stack.avail--;
1702 begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
1703 fixup_alt_jump
1704 = COMPILE_STACK_TOP.fixup_alt_jump
1705 ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
1706 : 0;
1707 laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
1708 this_group_regnum = COMPILE_STACK_TOP.regnum;
1709 /* If we've reached MAX_REGNUM groups, then this open
1710 won't actually generate any code, so we'll have to
1711 clear pending_exact explicitly. */
1712 pending_exact = 0;
1713
1714 /* We're at the end of the group, so now we know how many
1715 groups were inside this one. */
1716 if (this_group_regnum <= MAX_REGNUM)
1717 {
1718 unsigned char *inner_group_loc
1719 = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset;
1720
1721 *inner_group_loc = (unsigned char)
1722 (regnum - this_group_regnum);
1723 BUF_PUSH_3 (stop_memory, this_group_regnum,
1724 regnum - this_group_regnum);
1725 }
1726 }
1727 break;
1728
1729
1730 case '|': /* `\|'. */
1731 if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR)
1732 goto normal_backslash;
1733 handle_alt:
1734 if (syntax & RE_LIMITED_OPS)
1735 goto normal_char;
1736
1737 /* Insert before the previous alternative a jump which
1738 jumps to this alternative if the former fails. */
1739 GET_BUFFER_SPACE (3);
1740 INSERT_JUMP (on_failure_jump, begalt, b + 6);
1741 pending_exact = 0;
1742 b += 3;
1743
1744 /* The alternative before this one has a jump after it
1745 which gets executed if it gets matched. Adjust that
1746 jump so it will jump to this alternative's analogous
1747 jump (put in below, which in turn will jump to the next
1748 (if any) alternative's such jump, etc.). The last such
1749 jump jumps to the correct final destination. A picture:
1750 _____ _____
1751 | | | |
1752 | v | v
1753 a | b | c
1754
1755 If we are at `b', then fixup_alt_jump right now points to a
1756 three-byte space after `a'. We'll put in the jump, set
1757 fixup_alt_jump to right after `b', and leave behind three
1758 bytes which we'll fill in when we get to after `c'. */
1759
1760 if (fixup_alt_jump)
1761 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
1762
1763 /* Mark and leave space for a jump after this alternative,
1764 to be filled in later either by next alternative or
1765 when know we're at the end of a series of alternatives. */
1766 fixup_alt_jump = b;
1767 GET_BUFFER_SPACE (3);
1768 b += 3;
1769
1770 laststart = 0;
1771 begalt = b;
1772 break;
1773
1774
1775 case '{':
1776 /* If \{ is a literal. */
1777 if (!(syntax & RE_INTERVALS)
1778 /* If we're at `\{' and it's not the open-interval
1779 operator. */
1780 || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1781 || (p - 2 == pattern && p == pend))
1782 goto normal_backslash;
1783
1784 handle_interval:
1785 {
1786 /* If got here, then the syntax allows intervals. */
1787
1788 /* At least (most) this many matches must be made. */
1789 int lower_bound = -1, upper_bound = -1;
1790
1791 beg_interval = p - 1;
1792
1793 if (p == pend)
1794 {
1795 if (syntax & RE_NO_BK_BRACES)
1796 goto unfetch_interval;
1797 else
1798 return REG_EBRACE;
1799 }
1800
1801 GET_UNSIGNED_NUMBER (lower_bound);
1802
1803 if (c == ',')
1804 {
1805 GET_UNSIGNED_NUMBER (upper_bound);
1806 if (upper_bound < 0) upper_bound = RE_DUP_MAX;
1807 }
1808 else
1809 /* Interval such as `{1}' => match exactly once. */
1810 upper_bound = lower_bound;
1811
1812 if (lower_bound < 0 || upper_bound > RE_DUP_MAX
1813 || lower_bound > upper_bound)
1814 {
1815 if (syntax & RE_NO_BK_BRACES)
1816 goto unfetch_interval;
1817 else
1818 return REG_BADBR;
1819 }
1820
1821 if (!(syntax & RE_NO_BK_BRACES))
1822 {
1823 if (c != '\\') return REG_EBRACE;
1824
1825 PATFETCH (c);
1826 }
1827
1828 if (c != '}')
1829 {
1830 if (syntax & RE_NO_BK_BRACES)
1831 goto unfetch_interval;
1832 else
1833 return REG_BADBR;
1834 }
1835
1836 /* We just parsed a valid interval. */
1837
1838 /* If it's invalid to have no preceding re. */
1839 if (!laststart)
1840 {
1841 if (syntax & RE_CONTEXT_INVALID_OPS)
1842 return REG_BADRPT;
1843 else if (syntax & RE_CONTEXT_INDEP_OPS)
1844 laststart = b;
1845 else
1846 goto unfetch_interval;
1847 }
1848
1849 /* If the upper bound is zero, don't want to succeed at
1850 all; jump from `laststart' to `b + 3', which will be
1851 the end of the buffer after we insert the jump. */
1852 if (upper_bound == 0)
1853 {
1854 GET_BUFFER_SPACE (3);
1855 INSERT_JUMP (jump, laststart, b + 3);
1856 b += 3;
1857 }
1858
1859 /* Otherwise, we have a nontrivial interval. When
1860 we're all done, the pattern will look like:
1861 set_number_at <jump count> <upper bound>
1862 set_number_at <succeed_n count> <lower bound>
1863 succeed_n <after jump addr> <succed_n count>
1864 <body of loop>
1865 jump_n <succeed_n addr> <jump count>
1866 (The upper bound and `jump_n' are omitted if
1867 `upper_bound' is 1, though.) */
1868 else
1869 { /* If the upper bound is > 1, we need to insert
1870 more at the end of the loop. */
1871 unsigned nbytes = 10 + (upper_bound > 1) * 10;
1872
1873 GET_BUFFER_SPACE (nbytes);
1874
1875 /* Initialize lower bound of the `succeed_n', even
1876 though it will be set during matching by its
1877 attendant `set_number_at' (inserted next),
1878 because `re_compile_fastmap' needs to know.
1879 Jump to the `jump_n' we might insert below. */
1880 INSERT_JUMP2 (succeed_n, laststart,
1881 b + 5 + (upper_bound > 1) * 5,
1882 lower_bound);
1883 b += 5;
1884
1885 /* Code to initialize the lower bound. Insert
1886 before the `succeed_n'. The `5' is the last two
1887 bytes of this `set_number_at', plus 3 bytes of
1888 the following `succeed_n'. */
1889 insert_op2 (set_number_at, laststart, 5, lower_bound, b);
1890 b += 5;
1891
1892 if (upper_bound > 1)
1893 { /* More than one repetition is allowed, so
1894 append a backward jump to the `succeed_n'
1895 that starts this interval.
1896
1897 When we've reached this during matching,
1898 we'll have matched the interval once, so
1899 jump back only `upper_bound - 1' times. */
1900 STORE_JUMP2 (jump_n, b, laststart + 5,
1901 upper_bound - 1);
1902 b += 5;
1903
1904 /* The location we want to set is the second
1905 parameter of the `jump_n'; that is `b-2' as
1906 an absolute address. `laststart' will be
1907 the `set_number_at' we're about to insert;
1908 `laststart+3' the number to set, the source
1909 for the relative address. But we are
1910 inserting into the middle of the pattern --
1911 so everything is getting moved up by 5.
1912 Conclusion: (b - 2) - (laststart + 3) + 5,
1913 i.e., b - laststart.
1914
1915 We insert this at the beginning of the loop
1916 so that if we fail during matching, we'll
1917 reinitialize the bounds. */
1918 insert_op2 (set_number_at, laststart, b - laststart,
1919 upper_bound - 1, b);
1920 b += 5;
1921 }
1922 }
1923 pending_exact = 0;
1924 beg_interval = NULL;
1925 }
1926 break;
1927
1928 unfetch_interval:
1929 /* If an invalid interval, match the characters as literals. */
1930 assert (beg_interval);
1931 p = beg_interval;
1932 beg_interval = NULL;
1933
1934 /* normal_char and normal_backslash need `c'. */
1935 PATFETCH (c);
1936
1937 if (!(syntax & RE_NO_BK_BRACES))
1938 {
1939 if (p > pattern && p[-1] == '\\')
1940 goto normal_backslash;
1941 }
1942 goto normal_char;
1943
1944#ifdef emacs
1945 /* There is no way to specify the before_dot and after_dot
1946 operators. rms says this is ok. --karl */
1947 case '=':
1948 BUF_PUSH (at_dot);
1949 break;
1950
1951 case 's':
1952 laststart = b;
1953 PATFETCH (c);
1954 BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]);
1955 break;
1956
1957 case 'S':
1958 laststart = b;
1959 PATFETCH (c);
1960 BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]);
1961 break;
1962#endif /* emacs */
1963
1964
1965 case 'w':
1966 laststart = b;
1967 BUF_PUSH (wordchar);
1968 break;
1969
1970
1971 case 'W':
1972 laststart = b;
1973 BUF_PUSH (notwordchar);
1974 break;
1975
1976
1977 case '<':
1978 BUF_PUSH (wordbeg);
1979 break;
1980
1981 case '>':
1982 BUF_PUSH (wordend);
1983 break;
1984
1985 case 'b':
1986 BUF_PUSH (wordbound);
1987 break;
1988
1989 case 'B':
1990 BUF_PUSH (notwordbound);
1991 break;
1992
1993 case '`':
1994 BUF_PUSH (begbuf);
1995 break;
1996
1997 case '\'':
1998 BUF_PUSH (endbuf);
1999 break;
2000
2001 case '1': case '2': case '3': case '4': case '5':
2002 case '6': case '7': case '8': case '9':
2003 if (syntax & RE_NO_BK_REFS)
2004 goto normal_char;
2005
2006 c1 = c - '0';
2007
2008 if (c1 > regnum)
2009 return REG_ESUBREG;
2010
2011 /* Can't back reference to a subexpression if inside of it. */
2012 if (group_in_compile_stack( &compile_stack, c1))
2013 goto normal_char;
2014
2015 laststart = b;
2016 BUF_PUSH_2 (duplicate, c1);
2017 break;
2018
2019
2020 case '+':
2021 case '?':
2022 if (syntax & RE_BK_PLUS_QM)
2023 goto handle_plus;
2024 else
2025 goto normal_backslash;
2026
2027 default:
2028 normal_backslash:
2029 /* You might think it would be useful for \ to mean
38e01259 2030 not to translate; but if we don't translate it,
99024956
BK
2031 it will never match anything. */
2032 c = TRANSLATE (c);
2033 goto normal_char;
2034 }
2035 break;
2036
2037
2038 default:
2039 /* Expects the character in `c'. */
2040 normal_char:
2041 /* If no exactn currently being built. */
2042 if (!pending_exact
2043
2044 /* If last exactn not at current position. */
2045 || pending_exact + *pending_exact + 1 != b
2046
2047 /* We have only one byte following the exactn for the count. */
2048 || *pending_exact == (1 << BYTEWIDTH) - 1
2049
2050 /* If followed by a repetition operator. */
2051 || *p == '*' || *p == '^'
2052 || ((syntax & RE_BK_PLUS_QM)
2053 ? *p == '\\' && (p[1] == '+' || p[1] == '?')
2054 : (*p == '+' || *p == '?'))
2055 || ((syntax & RE_INTERVALS)
2056 && ((syntax & RE_NO_BK_BRACES)
2057 ? *p == '{'
2058 : (p[0] == '\\' && p[1] == '{'))))
2059 {
2060 /* Start building a new exactn. */
2061
2062 laststart = b;
2063
2064 BUF_PUSH_2 (exactn, 0);
2065 pending_exact = b - 1;
2066 }
2067
2068 BUF_PUSH (c);
2069 (*pending_exact)++;
2070 break;
2071 } /* switch (c) */
2072 } /* while p != pend */
2073
2074
2075 /* Through the pattern now. */
2076
2077 if (fixup_alt_jump)
2078 STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
2079
2080 if (!COMPILE_STACK_EMPTY)
2081 return REG_EPAREN;
2082
2083 free (compile_stack.stack);
2084
2085 /* We have succeeded; set the length of the buffer. */
2086 bufp->used = b - bufp->buffer;
2087
2088#ifdef DEBUG
2089 if (debug)
2090 {
2091 DEBUG_PRINT1 ("\nCompiled pattern: ");
2092 print_compiled_pattern (bufp);
2093 }
2094#endif /* DEBUG */
2095
2096 return REG_NOERROR;
2097} /* regex_compile */
2098\f
2099/* Subroutines for `regex_compile'. */
2100
2101/* Store OP at LOC followed by two-byte integer parameter ARG. */
2102
2103static void
2104store_op1 (op, loc, arg)
2105 re_opcode_t op;
2106 unsigned char *loc;
2107 int arg;
2108{
2109 *loc = (unsigned char) op;
2110 STORE_NUMBER (loc + 1, arg);
2111}
2112
2113
2114/* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
2115
2116static void
2117store_op2 (op, loc, arg1, arg2)
2118 re_opcode_t op;
2119 unsigned char *loc;
2120 int arg1, arg2;
2121{
2122 *loc = (unsigned char) op;
2123 STORE_NUMBER (loc + 1, arg1);
2124 STORE_NUMBER (loc + 3, arg2);
2125}
2126
2127
2128/* Copy the bytes from LOC to END to open up three bytes of space at LOC
2129 for OP followed by two-byte integer parameter ARG. */
2130
2131static void
2132insert_op1 (op, loc, arg, end)
2133 re_opcode_t op;
2134 unsigned char *loc;
2135 int arg;
2136 unsigned char *end;
2137{
2138 register unsigned char *pfrom = end;
2139 register unsigned char *pto = end + 3;
2140
2141 while (pfrom != loc)
2142 *--pto = *--pfrom;
2143
2144 store_op1 (op, loc, arg);
2145}
2146
2147
2148/* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
2149
2150static void
2151insert_op2 (op, loc, arg1, arg2, end)
2152 re_opcode_t op;
2153 unsigned char *loc;
2154 int arg1, arg2;
2155 unsigned char *end;
2156{
2157 register unsigned char *pfrom = end;
2158 register unsigned char *pto = end + 5;
2159
2160 while (pfrom != loc)
2161 *--pto = *--pfrom;
2162
2163 store_op2 (op, loc, arg1, arg2);
2164}
2165
2166
2167/* P points to just after a ^ in PATTERN. Return true if that ^ comes
2168 after an alternative or a begin-subexpression. We assume there is at
2169 least one character before the ^. */
2170
2171static boolean
2172at_begline_loc_p (pattern, p, syntax)
2173 const char *pattern, *p;
2174 reg_syntax_t syntax;
2175{
2176 const char *prev = p - 2;
2177# define prev_prev_backslash(p) (((p) > pattern) && ((p)[-1] == '\\'))
2178
2179 switch (*prev) {
2180 case '(':
2181 return (boolean)(
2182 prev_prev_backslash(prev) || ((syntax & RE_NO_BK_PARENS) != 0));
2183
2184 case '|':
2185 return (boolean)(
2186 prev_prev_backslash(prev) || ((syntax & RE_NO_BK_VBAR) != 0));
2187 }
2188 return false;
2189}
2190
2191
2192/* The dual of at_begline_loc_p. This one is for $. We assume there is
2193 at least one character after the $, i.e., `P < PEND'. */
2194
2195static boolean
2196at_endline_loc_p (p, pend, syntax)
2197 const char *p, *pend;
2198 int syntax;
2199{
2200 const char *next = p;
2201 boolean next_backslash = *next == '\\';
2202 const char *next_next = p + 1 < pend ? p + 1 : NULL;
2203#ifdef __TANDEM
2204# pragma NOWARN( 85 )
2205#endif
2206 return
2207 /* Before a subexpression? */
2208 (syntax & RE_NO_BK_PARENS ? *next == ')'
2209 : next_backslash && next_next && *next_next == ')')
2210 /* Before an alternative? */
2211 || (syntax & RE_NO_BK_VBAR ? *next == '|'
2212 : next_backslash && next_next && *next_next == '|');
2213}
2214#ifdef __TANDEM
2215# pragma WARN( 85 )
2216#endif
2217
2218
2219/* Returns true if REGNUM is in one of COMPILE_STACK's elements and
2220 false if it's not. */
2221static boolean
2222group_in_compile_stack( compile_stack_type* compile_stack,
2223 regnum_t regnum )
2224{
2225 int this_element;
2226
2227 for (this_element = compile_stack->avail - 1;
2228 this_element >= 0;
2229 this_element--)
2230 if (compile_stack->stack[this_element].regnum == regnum)
2231 return true;
2232
2233 return false;
2234}
2235#undef CS
2236
2237/* Read the ending character of a range (in a bracket expression) from the
2238 uncompiled pattern *P_PTR (which ends at PEND). We assume the
2239 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
2240 Then we set the translation of all bits between the starting and
2241 ending characters (inclusive) in the compiled pattern B.
2242
2243 Return an error code.
2244
2245 We use these short variable names so we can use the same macros as
2246 `regex_compile' itself. */
2247
2248static reg_errcode_t
2249compile_range (p_ptr, pend, translate, syntax, b)
2250 const char **p_ptr, *pend;
2251 char *translate;
2252 reg_syntax_t syntax;
2253 unsigned char *b;
2254{
2255 unsigned this_char;
2256
2257 const char *p = *p_ptr;
2258 int range_start, range_end;
2259
2260 if (p == pend)
2261 return REG_ERANGE;
2262
2263 /* Even though the pattern is a signed `char *', we need to fetch
2264 with unsigned char *'s; if the high bit of the pattern character
2265 is set, the range endpoints will be negative if we fetch using a
2266 signed char *.
2267
2268 We also want to fetch the endpoints without translating them; the
2269 appropriate translation is done in the bit-setting loop below. */
2270 range_start = ((unsigned char *) p)[-2];
2271 range_end = ((unsigned char *) p)[0];
2272
2273 /* Have to increment the pointer into the pattern string, so the
2274 caller isn't still at the ending character. */
2275 (*p_ptr)++;
2276
2277 /* If the start is after the end, the range is empty. */
2278 if (range_start > range_end)
2279 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
2280
2281 /* Here we see why `this_char' has to be larger than an `unsigned
2282 char' -- the range is inclusive, so if `range_end' == 0xff
2283 (assuming 8-bit characters), we would otherwise go into an infinite
2284 loop, since all characters <= 0xff. */
2285 for (this_char = range_start; this_char <= range_end; this_char++)
2286 {
2287 SET_LIST_BIT (TRANSLATE (this_char));
2288 }
2289
2290 return REG_NOERROR;
2291}
2292\f
2293/* Failure stack declarations and macros; both re_compile_fastmap and
2294 re_match_2 use a failure stack. These have to be macros because of
2295 REGEX_ALLOCATE. */
2296
2297
2298/* Number of failure points for which to initially allocate space
2299 when matching. If this number is exceeded, we allocate more
2300 space, so it is not a hard limit. */
2301#ifndef INIT_FAILURE_ALLOC
2302#define INIT_FAILURE_ALLOC 5
2303#endif
2304
2305/* Roughly the maximum number of failure points on the stack. Would be
2306 exactly that if always used MAX_FAILURE_SPACE each time we failed.
2307 This is a variable only so users of regex can assign to it; we never
2308 change it ourselves. */
2309int re_max_failures = 2000;
2310
2311typedef const unsigned char *fail_stack_elt_t;
2312
2313typedef struct
2314{
2315 fail_stack_elt_t *stack;
2316 unsigned size;
2317 unsigned avail; /* Offset of next open position. */
2318} fail_stack_type;
2319
2320#define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
2321#define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
2322#define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
2323#define FAIL_STACK_TOP() (fail_stack.stack[fail_stack.avail])
2324
2325
2326/* Initialize `fail_stack'. Do `return -2' if the alloc fails. */
2327
2328#define INIT_FAIL_STACK() \
2329 do { \
2330 fail_stack.stack = (fail_stack_elt_t *) \
2331 REGEX_ALLOCATE (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \
2332 \
2333 if (fail_stack.stack == NULL) \
2334 return -2; \
2335 \
2336 fail_stack.size = INIT_FAILURE_ALLOC; \
2337 fail_stack.avail = 0; \
2338 } while (0)
2339
2340
2341/* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
2342
2343 Return 1 if succeeds, and 0 if either ran out of memory
2344 allocating space for it or it was already too large.
2345
2346 REGEX_REALLOCATE requires `destination' be declared. */
2347
2348#define DOUBLE_FAIL_STACK(fail_stack) \
2349 ((fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \
2350 ? 0 \
2351 : ((fail_stack).stack = (fail_stack_elt_t *) \
2352 REGEX_REALLOCATE ((fail_stack).stack, \
2353 (fail_stack).size * sizeof (fail_stack_elt_t), \
2354 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
2355 \
2356 (fail_stack).stack == NULL \
2357 ? 0 \
2358 : ((fail_stack).size <<= 1, \
2359 1)))
2360
2361
2362/* Push PATTERN_OP on FAIL_STACK.
2363
2364 Return 1 if was able to do so and 0 if ran out of memory allocating
2365 space to do so. */
2366#define PUSH_PATTERN_OP(pattern_op, fail_stack) \
2367 ((FAIL_STACK_FULL () \
2368 && !DOUBLE_FAIL_STACK (fail_stack)) \
2369 ? 0 \
2370 : ((fail_stack).stack[(fail_stack).avail++] = pattern_op, \
2371 1))
2372
2373/* This pushes an item onto the failure stack. Must be a four-byte
2374 value. Assumes the variable `fail_stack'. Probably should only
2375 be called from within `PUSH_FAILURE_POINT'. */
2376#define PUSH_FAILURE_ITEM(item) \
2377 fail_stack.stack[fail_stack.avail++] = (fail_stack_elt_t) item
2378
2379/* The complement operation. Assumes `fail_stack' is nonempty. */
2380#define POP_FAILURE_ITEM() fail_stack.stack[--fail_stack.avail]
2381
2382/* Used to omit pushing failure point id's when we're not debugging. */
2383#ifdef DEBUG
2384#define DEBUG_PUSH PUSH_FAILURE_ITEM
2385#define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_ITEM ()
2386#else
2387#define DEBUG_PUSH(item)
2388#define DEBUG_POP(item_addr)
2389#endif
2390
2391
2392/* Push the information about the state we will need
2393 if we ever fail back to it.
2394
2395 Requires variables fail_stack, regstart, regend, reg_info, and
2396 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be
2397 declared.
2398
2399 Does `return FAILURE_CODE' if runs out of memory. */
2400
2401#define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
2402 do { \
2403 DESTINATION_DECL \
2404 /* Must be int, so when we don't save any registers, the arithmetic \
2405 of 0 + -1 isn't done as unsigned. */ \
2406 int this_reg; \
2407 \
2408 DEBUG_STATEMENT (failure_id++); \
2409 DEBUG_STATEMENT (nfailure_points_pushed++); \
2410 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
2411 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\
2412 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\
2413 \
2414 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \
2415 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \
2416 \
2417 /* Ensure we have enough space allocated for what we will push. */ \
2418 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
2419 { \
2420 if (!DOUBLE_FAIL_STACK (fail_stack)) \
2421 return failure_code; \
2422 \
2423 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
2424 (fail_stack).size); \
2425 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\
2426 } \
2427 \
2428 /* Push the info, starting with the registers. */ \
2429 DEBUG_PRINT1 ("\n"); \
2430 \
2431 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
2432 this_reg++) \
2433 { \
2434 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \
2435 DEBUG_STATEMENT (num_regs_pushed++); \
2436 \
2437 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \
2438 PUSH_FAILURE_ITEM (regstart[this_reg]); \
2439 \
2440 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \
2441 PUSH_FAILURE_ITEM (regend[this_reg]); \
2442 \
2443 DEBUG_PRINT2 (" info: 0x%x\n ", reg_info[this_reg]); \
2444 DEBUG_PRINT2 (" match_null=%d", \
2445 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
2446 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
2447 DEBUG_PRINT2 (" matched_something=%d", \
2448 MATCHED_SOMETHING (reg_info[this_reg])); \
2449 DEBUG_PRINT2 (" ever_matched=%d", \
2450 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
2451 DEBUG_PRINT1 ("\n"); \
2452 PUSH_FAILURE_ITEM (reg_info[this_reg].word); \
2453 } \
2454 \
2455 DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg);\
2456 PUSH_FAILURE_ITEM (lowest_active_reg); \
2457 \
2458 DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg);\
2459 PUSH_FAILURE_ITEM (highest_active_reg); \
2460 \
2461 DEBUG_PRINT2 (" Pushing pattern 0x%x: ", pattern_place); \
2462 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
2463 PUSH_FAILURE_ITEM (pattern_place); \
2464 \
2465 DEBUG_PRINT2 (" Pushing string 0x%x: `", string_place); \
2466 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
2467 size2); \
2468 DEBUG_PRINT1 ("'\n"); \
2469 PUSH_FAILURE_ITEM (string_place); \
2470 \
2471 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
2472 DEBUG_PUSH (failure_id); \
2473 } while (0)
2474
2475/* This is the number of items that are pushed and popped on the stack
2476 for each register. */
2477#define NUM_REG_ITEMS 3
2478
2479/* Individual items aside from the registers. */
2480#ifdef DEBUG
2481#define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
2482#else
2483#define NUM_NONREG_ITEMS 4
2484#endif
2485
2486/* We push at most this many items on the stack. */
2487#define MAX_FAILURE_ITEMS ((num_regs - 1) * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
2488
2489/* We actually push this many items. */
2490#define NUM_FAILURE_ITEMS \
2491 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \
2492 + NUM_NONREG_ITEMS)
2493
2494/* How many items can still be added to the stack without overflowing it. */
2495#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
2496
2497
2498/* Pops what PUSH_FAIL_STACK pushes.
2499
2500 We restore into the parameters, all of which should be lvalues:
2501 STR -- the saved data position.
2502 PAT -- the saved pattern position.
2503 LOW_REG, HIGH_REG -- the highest and lowest active registers.
2504 REGSTART, REGEND -- arrays of string positions.
2505 REG_INFO -- array of information about each subexpression.
2506
2507 Also assumes the variables `fail_stack' and (if debugging), `bufp',
2508 `pend', `string1', `size1', `string2', and `size2'. */
2509
2510#define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\
2511{ \
2512 DEBUG_STATEMENT (fail_stack_elt_t failure_id;) \
2513 int this_reg; \
2514 const unsigned char *string_temp; \
2515 \
2516 assert (!FAIL_STACK_EMPTY ()); \
2517 \
2518 /* Remove failure points and point to how many regs pushed. */ \
2519 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
2520 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
2521 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
2522 \
2523 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
2524 \
2525 DEBUG_POP (&failure_id); \
2526 DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \
2527 \
2528 /* If the saved string location is NULL, it came from an \
2529 on_failure_keep_string_jump opcode, and we want to throw away the \
2530 saved NULL, thus retaining our current position in the string. */ \
2531 string_temp = POP_FAILURE_ITEM (); \
2532 if (string_temp != NULL) \
2533 str = (const char *) string_temp; \
2534 \
2535 DEBUG_PRINT2 (" Popping string 0x%x: `", str); \
2536 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
2537 DEBUG_PRINT1 ("'\n"); \
2538 \
2539 pat = (unsigned char *) POP_FAILURE_ITEM (); \
2540 DEBUG_PRINT2 (" Popping pattern 0x%x: ", pat); \
2541 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
2542 \
2543 /* Restore register info. */ \
2544 high_reg = (unsigned) POP_FAILURE_ITEM (); \
2545 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \
2546 \
2547 low_reg = (unsigned) POP_FAILURE_ITEM (); \
2548 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \
2549 \
2550 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
2551 { \
2552 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \
2553 \
2554 reg_info[this_reg].word = POP_FAILURE_ITEM (); \
2555 DEBUG_PRINT2 (" info: 0x%x\n", reg_info[this_reg]); \
2556 \
2557 regend[this_reg] = (const char *) POP_FAILURE_ITEM (); \
2558 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \
2559 \
2560 regstart[this_reg] = (const char *) POP_FAILURE_ITEM (); \
2561 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \
2562 } \
2563 \
2564 DEBUG_STATEMENT (nfailure_points_popped++); \
2565} /* POP_FAILURE_POINT */
2566\f
2567/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
2568 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
2569 characters can start a string that matches the pattern. This fastmap
2570 is used by re_search to skip quickly over impossible starting points.
2571
2572 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
2573 area as BUFP->fastmap.
2574
2575 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
2576 the pattern buffer.
2577
2578 Returns 0 if we succeed, -2 if an internal error. */
2579
2580int
2581re_compile_fastmap (bufp)
2582 struct re_pattern_buffer *bufp;
2583{
2584 int j, k;
2585 fail_stack_type fail_stack;
2586#ifndef REGEX_MALLOC
2587 char *destination;
2588#endif
2589 /* We don't push any register information onto the failure stack. */
2590 unsigned num_regs = 0;
2591
2592 register char *fastmap = bufp->fastmap;
2593 unsigned char *pattern = bufp->buffer;
2594 unsigned long size = bufp->used;
2595 const unsigned char *p = pattern;
2596 register unsigned char *pend = pattern + size;
2597
2598 /* Assume that each path through the pattern can be null until
2599 proven otherwise. We set this false at the bottom of switch
2600 statement, to which we get only if a particular path doesn't
2601 match the empty string. */
2602 boolean path_can_be_null = true;
2603
2604 /* We aren't doing a `succeed_n' to begin with. */
2605 boolean succeed_n_p = false;
2606
2607 assert (fastmap != NULL && p != NULL);
2608
2609 INIT_FAIL_STACK ();
2610 bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
2611 bufp->fastmap_accurate = 1; /* It will be when we're done. */
2612 bufp->can_be_null = 0;
2613
2614 while (p != pend || !FAIL_STACK_EMPTY ())
2615 {
2616 if (p == pend)
2617 {
2618 bufp->can_be_null |= path_can_be_null;
2619
2620 /* Reset for next path. */
2621 path_can_be_null = true;
2622
2623 p = fail_stack.stack[--fail_stack.avail];
2624 }
2625
2626 /* We should never be about to go beyond the end of the pattern. */
2627 assert (p < pend);
2628
2629#ifdef SWITCH_ENUM_BUG
2630 switch ((int) ((re_opcode_t) *p++))
2631#else
2632 switch ((re_opcode_t) *p++)
2633#endif
2634 {
2635
2636 /* I guess the idea here is to simply not bother with a fastmap
2637 if a backreference is used, since it's too hard to figure out
2638 the fastmap for the corresponding group. Setting
2639 `can_be_null' stops `re_search_2' from using the fastmap, so
2640 that is all we do. */
2641 case duplicate:
2642 bufp->can_be_null = 1;
2643 return 0;
2644
2645
2646 /* Following are the cases which match a character. These end
2647 with `break'. */
2648
2649 case exactn:
2650 fastmap[p[1]] = 1;
2651 break;
2652
2653
2654 case charset:
2655 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
2656 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
2657 fastmap[j] = 1;
2658 break;
2659
2660
2661 case charset_not:
2662 /* Chars beyond end of map must be allowed. */
2663 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
2664 fastmap[j] = 1;
2665
2666 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
2667 if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
2668 fastmap[j] = 1;
2669 break;
2670
2671
2672 case wordchar:
2673 for (j = 0; j < (1 << BYTEWIDTH); j++)
2674 if (SYNTAX (j) == Sword)
2675 fastmap[j] = 1;
2676 break;
2677
2678
2679 case notwordchar:
2680 for (j = 0; j < (1 << BYTEWIDTH); j++)
2681 if (SYNTAX (j) != Sword)
2682 fastmap[j] = 1;
2683 break;
2684
2685
2686 case anychar:
2687 /* `.' matches anything ... */
2688 for (j = 0; j < (1 << BYTEWIDTH); j++)
2689 fastmap[j] = 1;
2690
2691 /* ... except perhaps newline. */
2692 if (!(bufp->syntax & RE_DOT_NEWLINE))
2693 fastmap['\n'] = 0;
2694
2695 /* Return if we have already set `can_be_null'; if we have,
2696 then the fastmap is irrelevant. Something's wrong here. */
2697 else if (bufp->can_be_null)
2698 return 0;
2699
2700 /* Otherwise, have to check alternative paths. */
2701 break;
2702
2703
2704#ifdef emacs
2705 case syntaxspec:
2706 k = *p++;
2707 for (j = 0; j < (1 << BYTEWIDTH); j++)
2708 if (SYNTAX (j) == (enum syntaxcode) k)
2709 fastmap[j] = 1;
2710 break;
2711
2712
2713 case notsyntaxspec:
2714 k = *p++;
2715 for (j = 0; j < (1 << BYTEWIDTH); j++)
2716 if (SYNTAX (j) != (enum syntaxcode) k)
2717 fastmap[j] = 1;
2718 break;
2719
2720
2721 /* All cases after this match the empty string. These end with
2722 `continue'. */
2723
2724
2725 case before_dot:
2726 case at_dot:
2727 case after_dot:
2728 continue;
2729#endif /* not emacs */
2730
2731
2732 case no_op:
2733 case begline:
2734 case endline:
2735 case begbuf:
2736 case endbuf:
2737 case wordbound:
2738 case notwordbound:
2739 case wordbeg:
2740 case wordend:
2741 case push_dummy_failure:
2742 continue;
2743
2744
2745 case jump_n:
2746 case pop_failure_jump:
2747 case maybe_pop_jump:
2748 case jump:
2749 case jump_past_alt:
2750 case dummy_failure_jump:
2751 EXTRACT_NUMBER_AND_INCR (j, p);
2752 p += j;
2753 if (j > 0)
2754 continue;
2755
2756 /* Jump backward implies we just went through the body of a
2757 loop and matched nothing. Opcode jumped to should be
2758 `on_failure_jump' or `succeed_n'. Just treat it like an
2759 ordinary jump. For a * loop, it has pushed its failure
2760 point already; if so, discard that as redundant. */
2761 if ((re_opcode_t) *p != on_failure_jump
2762 && (re_opcode_t) *p != succeed_n)
2763 continue;
2764
2765 p++;
2766 EXTRACT_NUMBER_AND_INCR (j, p);
2767 p += j;
2768
2769 /* If what's on the stack is where we are now, pop it. */
2770 if (!FAIL_STACK_EMPTY ()
2771 && fail_stack.stack[fail_stack.avail - 1] == p)
2772 fail_stack.avail--;
2773
2774 continue;
2775
2776
2777 case on_failure_jump:
2778 case on_failure_keep_string_jump:
2779 handle_on_failure_jump:
2780 EXTRACT_NUMBER_AND_INCR (j, p);
2781
2782 /* For some patterns, e.g., `(a?)?', `p+j' here points to the
2783 end of the pattern. We don't want to push such a point,
2784 since when we restore it above, entering the switch will
2785 increment `p' past the end of the pattern. We don't need
2786 to push such a point since we obviously won't find any more
2787 fastmap entries beyond `pend'. Such a pattern can match
2788 the null string, though. */
2789 if (p + j < pend)
2790 {
2791 /*
2792 * WARN 191: illegal pointer conversion
2793 * between CONST and NON-CONST attribute
2794 */
2795# ifdef __TANDEM
2796# pragma NOWARN( 191 )
2797# endif
2798 if (!PUSH_PATTERN_OP (p + j, fail_stack))
2799 return -2;
2800# ifdef __TANDEM
2801# pragma WARN( 191 )
2802# endif
2803 }
2804 else
2805 bufp->can_be_null = 1;
2806
2807 if (succeed_n_p)
2808 {
2809 EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */
2810 succeed_n_p = false;
2811 }
2812
2813 continue;
2814
2815
2816 case succeed_n:
2817 /* Get to the number of times to succeed. */
2818 p += 2;
2819
2820 /* Increment p past the n for when k != 0. */
2821 EXTRACT_NUMBER_AND_INCR (k, p);
2822 if (k == 0)
2823 {
2824 p -= 4;
2825 succeed_n_p = true; /* Spaghetti code alert. */
2826 goto handle_on_failure_jump;
2827 }
2828 continue;
2829
2830
2831 case set_number_at:
2832 p += 4;
2833 continue;
2834
2835
2836 case start_memory:
2837 case stop_memory:
2838 p += 2;
2839 continue;
2840
2841
2842 default:
2843 abort (); /* We have listed all the cases. */
2844 } /* switch *p++ */
2845
2846 /* Getting here means we have found the possible starting
2847 characters for one path of the pattern -- and that the empty
2848 string does not match. We need not follow this path further.
2849 Instead, look at the next alternative (remembered on the
2850 stack), or quit if no more. The test at the top of the loop
2851 does these things. */
2852 path_can_be_null = false;
2853 p = pend;
2854 } /* while p */
2855
2856 /* Set `can_be_null' for the last path (also the first path, if the
2857 pattern is empty). */
2858 bufp->can_be_null |= path_can_be_null;
2859 return 0;
2860} /* re_compile_fastmap */
2861\f
2862/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
2863 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
2864 this memory for recording register information. STARTS and ENDS
2865 must be allocated using the malloc library routine, and must each
2866 be at least NUM_REGS * sizeof (regoff_t) bytes long.
2867
2868 If NUM_REGS == 0, then subsequent matches should allocate their own
2869 register data.
2870
2871 Unless this function is called, the first search or match using
2872 PATTERN_BUFFER will allocate its own register data, without
2873 freeing the old data. */
2874
2875void
2876re_set_registers (bufp, regs, num_regs, starts, ends)
2877 struct re_pattern_buffer *bufp;
2878 struct re_registers *regs;
2879 unsigned num_regs;
2880 regoff_t *starts, *ends;
2881{
2882 if (num_regs)
2883 {
2884 bufp->regs_allocated = REGS_REALLOCATE;
2885 regs->num_regs = num_regs;
2886 regs->start = starts;
2887 regs->end = ends;
2888 }
2889 else
2890 {
2891 bufp->regs_allocated = REGS_UNALLOCATED;
2892 regs->num_regs = 0;
2893 regs->start = regs->end = (regoff_t) 0;
2894 }
2895}
2896\f
2897/* Searching routines. */
2898
2899/* Like re_search_2, below, but only one string is specified, and
2900 doesn't let you say where to stop matching. */
2901
2902int
2903re_search (bufp, string, size, startpos, range, regs)
2904 struct re_pattern_buffer *bufp;
2905 const char *string;
2906 int size, startpos, range;
2907 struct re_registers *regs;
2908{
2909 return re_search_2 (bufp, NULL, 0, string, size, startpos, range,
2910 regs, size);
2911}
2912
2913
2914/* Using the compiled pattern in BUFP->buffer, first tries to match the
2915 virtual concatenation of STRING1 and STRING2, starting first at index
2916 STARTPOS, then at STARTPOS + 1, and so on.
2917
2918 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
2919
2920 RANGE is how far to scan while trying to match. RANGE = 0 means try
2921 only at STARTPOS; in general, the last start tried is STARTPOS +
2922 RANGE.
2923
2924 In REGS, return the indices of the virtual concatenation of STRING1
2925 and STRING2 that matched the entire BUFP->buffer and its contained
2926 subexpressions.
2927
2928 Do not consider matching one past the index STOP in the virtual
2929 concatenation of STRING1 and STRING2.
2930
2931 We return either the position in the strings at which the match was
2932 found, -1 if no match, or -2 if error (such as failure
2933 stack overflow). */
2934
2935int
2936re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop)
2937 struct re_pattern_buffer *bufp;
2938 const char *string1, *string2;
2939 int size1, size2;
2940 int startpos;
2941 int range;
2942 struct re_registers *regs;
2943 int stop;
2944{
2945 int val;
2946 register char *fastmap = bufp->fastmap;
2947 register char *translate = bufp->translate;
2948 int total_size = size1 + size2;
2949 int endpos = startpos + range;
2950
2951 /* Check for out-of-range STARTPOS. */
2952 if (startpos < 0 || startpos > total_size)
2953 return -1;
2954
2955 /* Fix up RANGE if it might eventually take us outside
2956 the virtual concatenation of STRING1 and STRING2. */
2957 if (endpos < -1)
2958 range = -1 - startpos;
2959 else if (endpos > total_size)
2960 range = total_size - startpos;
2961
2962 /* If the search isn't to be a backwards one, don't waste time in a
2963 search for a pattern that must be anchored. */
2964 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0)
2965 {
2966 if (startpos > 0)
2967 return -1;
2968 else
2969 range = 1;
2970 }
2971
2972 /* Update the fastmap now if not correct already. */
2973 if (fastmap && !bufp->fastmap_accurate)
2974 if (re_compile_fastmap (bufp) == -2)
2975 return -2;
2976
2977 /* Loop through the string, looking for a place to start matching. */
2978 for (;;)
2979 {
2980 /* If a fastmap is supplied, skip quickly over characters that
2981 cannot be the start of a match. If the pattern can match the
2982 null string, however, we don't need to skip characters; we want
2983 the first null string. */
2984 if (fastmap && startpos < total_size && !bufp->can_be_null)
2985 {
2986 if (range > 0) /* Searching forwards. */
2987 {
2988 register const char *d;
2989 register int lim = 0;
2990 int irange = range;
2991
2992 if (startpos < size1 && startpos + range >= size1)
2993 lim = range - (size1 - startpos);
2994
2995 d = (startpos >= size1 ? string2 - size1 : string1) + startpos;
2996
2997 /* Written out as an if-else to avoid testing `translate'
2998 inside the loop. */
2999 if (translate)
3000 while (range > lim
3001 && !fastmap[(unsigned char)
3002 translate[(unsigned char) *d++]])
3003 range--;
3004 else
3005 while (range > lim && !fastmap[(unsigned char) *d++])
3006 range--;
3007
3008 startpos += irange - range;
3009 }
3010 else /* Searching backwards. */
3011 {
3012 register char c = (size1 == 0 || startpos >= size1
3013 ? string2[startpos - size1]
3014 : string1[startpos]);
3015
3016 if (!fastmap[(unsigned char) TRANSLATE (c)])
3017 goto advance;
3018 }
3019 }
3020
3021 /* If can't match the null string, and that's all we have left, fail. */
3022 if (range >= 0 && startpos == total_size && fastmap
3023 && !bufp->can_be_null)
3024 return -1;
3025
3026 val = re_match_2 (bufp, string1, size1, string2, size2,
3027 startpos, regs, stop);
3028 if (val >= 0)
3029 return startpos;
3030
3031 if (val == -2)
3032 return -2;
3033
3034 advance:
3035 if (!range)
3036 break;
3037 else if (range > 0)
3038 {
3039 range--;
3040 startpos++;
3041 }
3042 else
3043 {
3044 range++;
3045 startpos--;
3046 }
3047 }
3048 return -1;
3049} /* re_search_2 */
3050\f
3051/* Declarations and macros for re_match_2. */
3052
3053static int bcmp_translate ();
3054static boolean alt_match_null_string_p (),
3055 common_op_match_null_string_p (),
3056 group_match_null_string_p ();
3057
3058/* Structure for per-register (a.k.a. per-group) information.
3059 This must not be longer than one word, because we push this value
3060 onto the failure stack. Other register information, such as the
3061 starting and ending positions (which are addresses), and the list of
3062 inner groups (which is a bits list) are maintained in separate
3063 variables.
3064
3065 We are making a (strictly speaking) nonportable assumption here: that
3066 the compiler will pack our bit fields into something that fits into
3067 the type of `word', i.e., is something that fits into one item on the
3068 failure stack. */
3069typedef union
3070{
3071 fail_stack_elt_t word;
3072 struct
3073 {
3074 /* This field is one if this group can match the empty string,
3075 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
3076#define MATCH_NULL_UNSET_VALUE 3
3077 unsigned match_null_string_p : 2;
3078 unsigned is_active : 1;
3079 unsigned matched_something : 1;
3080 unsigned ever_matched_something : 1;
3081 } bits;
3082} register_info_type;
3083
3084#define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
3085#define IS_ACTIVE(R) ((R).bits.is_active)
3086#define MATCHED_SOMETHING(R) ((R).bits.matched_something)
3087#define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
3088
3089
3090/* Call this when have matched a real character; it sets `matched' flags
3091 for the subexpressions which we are currently inside. Also records
3092 that those subexprs have matched. */
3093#define SET_REGS_MATCHED() \
3094 do \
3095 { \
3096 unsigned r; \
3097 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
3098 { \
3099 MATCHED_SOMETHING (reg_info[r]) \
3100 = EVER_MATCHED_SOMETHING (reg_info[r]) \
3101 = 1; \
3102 } \
3103 } \
3104 while (0)
3105
3106
3107/* This converts PTR, a pointer into one of the search strings `string1'
3108 and `string2' into an offset from the beginning of that string. */
3109#define POINTER_TO_OFFSET(ptr) \
3110 (FIRST_STRING_P (ptr) ? (ptr) - string1 : (ptr) - string2 + size1)
3111
3112/* Registers are set to a sentinel when they haven't yet matched. */
3113#define REG_UNSET_VALUE ((char *) -1)
3114#define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
3115
3116
3117/* Macros for dealing with the split strings in re_match_2. */
3118
3119#define MATCHING_IN_FIRST_STRING (dend == end_match_1)
3120
3121/* Call before fetching a character with *d. This switches over to
3122 string2 if necessary. */
3123#define PREFETCH() \
3124 while (d == dend) \
3125 { \
3126 /* End of string2 => fail. */ \
3127 if (dend == end_match_2) \
3128 goto fail; \
3129 /* End of string1 => advance to string2. */ \
3130 d = string2; \
3131 dend = end_match_2; \
3132 }
3133
3134
3135/* Test if at very beginning or at very end of the virtual concatenation
3136 of `string1' and `string2'. If only one string, it's `string2'. */
3137#define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
3138#define AT_STRINGS_END(d) ((d) == end2)
3139
3140
3141/* Test if D points to a character which is word-constituent. We have
3142 two special cases to check for: if past the end of string1, look at
3143 the first character in string2; and if before the beginning of
3144 string2, look at the last character in string1. */
3145#define WORDCHAR_P(d) \
3146 (SYNTAX ((d) == end1 ? *string2 \
3147 : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \
3148 == Sword)
3149
3150/* Test if the character before D and the one at D differ with respect
3151 to being word-constituent. */
3152#define AT_WORD_BOUNDARY(d) \
3153 (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \
3154 || WORDCHAR_P (d - 1) != WORDCHAR_P (d))
3155
3156
3157/* Free everything we malloc. */
3158#ifdef REGEX_MALLOC
3159#define FREE_VAR(var) do{if(var){free((void*)var); var = NULL;}}while (0)
3160#define FREE_VARIABLES() \
3161 do { \
3162 FREE_VAR (fail_stack.stack); \
3163 FREE_VAR (regstart); \
3164 FREE_VAR (regend); \
3165 FREE_VAR (old_regstart); \
3166 FREE_VAR (old_regend); \
3167 FREE_VAR (best_regstart); \
3168 FREE_VAR (best_regend); \
3169 FREE_VAR (reg_info); \
3170 FREE_VAR (reg_dummy); \
3171 FREE_VAR (reg_info_dummy); \
3172 } while (0)
3173#else /* not REGEX_MALLOC */
3174/* Some MIPS systems (at least) want this to free alloca'd storage. */
3175#define FREE_VARIABLES() alloca (0)
3176#endif /* not REGEX_MALLOC */
3177
3178
3179/* These values must meet several constraints. They must not be valid
3180 register values; since we have a limit of 255 registers (because
3181 we use only one byte in the pattern for the register number), we can
3182 use numbers larger than 255. They must differ by 1, because of
3183 NUM_FAILURE_ITEMS above. And the value for the lowest register must
3184 be larger than the value for the highest register, so we do not try
3185 to actually save any registers when none are active. */
3186#define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
3187#define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
3188\f
3189/* Matching routines. */
3190
3191#ifndef emacs /* Emacs never uses this. */
3192/* re_match is like re_match_2 except it takes only a single string. */
3193
3194int
3195re_match (bufp, string, size, pos, regs)
3196 struct re_pattern_buffer *bufp;
3197 const char *string;
3198 int size, pos;
3199 struct re_registers *regs;
3200 {
3201 return re_match_2 (bufp, NULL, 0, string, size, pos, regs, size);
3202}
3203#endif /* not emacs */
3204
3205
3206/* re_match_2 matches the compiled pattern in BUFP against the
38e01259 3207 (virtual) concatenation of STRING1 and STRING2 (of length SIZE1
99024956
BK
3208 and SIZE2, respectively). We start matching at POS, and stop
3209 matching at STOP.
3210
3211 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
3212 store offsets for the substring each group matched in REGS. See the
3213 documentation for exactly how many groups we fill.
3214
3215 We return -1 if no match, -2 if an internal error (such as the
3216 failure stack overflowing). Otherwise, we return the length of the
3217 matched substring. */
3218
3219int
3220re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop)
3221 struct re_pattern_buffer *bufp;
3222 const char *string1, *string2;
3223 int size1, size2;
3224 int pos;
3225 struct re_registers *regs;
3226 int stop;
3227{
3228 /* General temporaries. */
3229 int mcnt;
3230 unsigned char *p1;
3231
3232 /* Just past the end of the corresponding string. */
3233 const char *end1, *end2;
3234
3235 /* Pointers into string1 and string2, just past the last characters in
3236 each to consider matching. */
3237 const char *end_match_1, *end_match_2;
3238
3239 /* Where we are in the data, and the end of the current string. */
3240 const char *d, *dend;
3241
3242 /* Where we are in the pattern, and the end of the pattern. */
3243 unsigned char *p = bufp->buffer;
3244 register unsigned char *pend = p + bufp->used;
3245
3246 /* We use this to map every character in the string. */
3247 char *translate = bufp->translate;
3248
3249 /* Failure point stack. Each place that can handle a failure further
3250 down the line pushes a failure point on this stack. It consists of
3251 restart, regend, and reg_info for all registers corresponding to
3252 the subexpressions we're currently inside, plus the number of such
3253 registers, and, finally, two char *'s. The first char * is where
3254 to resume scanning the pattern; the second one is where to resume
3255 scanning the strings. If the latter is zero, the failure point is
3256 a ``dummy''; if a failure happens and the failure point is a dummy,
38e01259 3257 it gets discarded and the next one is tried. */
99024956
BK
3258 fail_stack_type fail_stack;
3259#ifdef DEBUG
3260 static unsigned failure_id = 0;
3261 unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
3262#endif
3263
3264 /* We fill all the registers internally, independent of what we
3265 return, for use in backreferences. The number here includes
3266 an element for register zero. */
3267 unsigned num_regs = bufp->re_nsub + 1;
3268
3269 /* The currently active registers. */
3270 unsigned lowest_active_reg = NO_LOWEST_ACTIVE_REG;
3271 unsigned highest_active_reg = NO_HIGHEST_ACTIVE_REG;
3272
3273 /* Information on the contents of registers. These are pointers into
3274 the input strings; they record just what was matched (on this
3275 attempt) by a subexpression part of the pattern, that is, the
3276 regnum-th regstart pointer points to where in the pattern we began
3277 matching and the regnum-th regend points to right after where we
3278 stopped matching the regnum-th subexpression. (The zeroth register
3279 keeps track of what the whole pattern matches.) */
3280 const char **regstart, **regend;
3281
3282 /* If a group that's operated upon by a repetition operator fails to
3283 match anything, then the register for its start will need to be
3284 restored because it will have been set to wherever in the string we
3285 are when we last see its open-group operator. Similarly for a
3286 register's end. */
3287 const char **old_regstart, **old_regend;
3288
3289 /* The is_active field of reg_info helps us keep track of which (possibly
3290 nested) subexpressions we are currently in. The matched_something
3291 field of reg_info[reg_num] helps us tell whether or not we have
3292 matched any of the pattern so far this time through the reg_num-th
3293 subexpression. These two fields get reset each time through any
3294 loop their register is in. */
3295 register_info_type *reg_info;
3296
3297 /* The following record the register info as found in the above
3298 variables when we find a match better than any we've seen before.
3299 This happens as we backtrack through the failure points, which in
3300 turn happens only if we have not yet matched the entire string. */
3301 unsigned best_regs_set = false;
3302 const char **best_regstart, **best_regend;
3303
3304 /* Logically, this is `best_regend[0]'. But we don't want to have to
3305 allocate space for that if we're not allocating space for anything
3306 else (see below). Also, we never need info about register 0 for
3307 any of the other register vectors, and it seems rather a kludge to
3308 treat `best_regend' differently than the rest. So we keep track of
3309 the end of the best match so far in a separate variable. We
3310 initialize this to NULL so that when we backtrack the first time
3311 and need to test it, it's not garbage. */
3312 const char *match_end = NULL;
3313
3314 /* Used when we pop values we don't care about. */
3315 const char **reg_dummy;
3316 register_info_type *reg_info_dummy;
3317
3318#ifdef DEBUG
3319 /* Counts the total number of registers pushed. */
3320 unsigned num_regs_pushed = 0;
3321#endif
3322
3323 DEBUG_PRINT1 ("\n\nEntering re_match_2.\n");
3324
3325 INIT_FAIL_STACK ();
3326
3327 /* Do not bother to initialize all the register variables if there are
3328 no groups in the pattern, as it takes a fair amount of time. If
3329 there are groups, we include space for register 0 (the whole
3330 pattern), even though we never use it, since it simplifies the
3331 array indexing. We should fix this. */
3332 if (bufp->re_nsub)
3333 {
3334 regstart = REGEX_TALLOC (num_regs, const char *);
3335 regend = REGEX_TALLOC (num_regs, const char *);
3336 old_regstart = REGEX_TALLOC (num_regs, const char *);
3337 old_regend = REGEX_TALLOC (num_regs, const char *);
3338 best_regstart = REGEX_TALLOC (num_regs, const char *);
3339 best_regend = REGEX_TALLOC (num_regs, const char *);
3340 reg_info = REGEX_TALLOC (num_regs, register_info_type);
3341 reg_dummy = REGEX_TALLOC (num_regs, const char *);
3342 reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
3343
3344 if (!(regstart && regend && old_regstart && old_regend && reg_info
3345 && best_regstart && best_regend && reg_dummy && reg_info_dummy))
3346 {
3347 FREE_VARIABLES ();
3348 return -2;
3349 }
3350 }
3351#ifdef REGEX_MALLOC
3352 else
3353 {
3354 /* We must initialize all our variables to NULL, so that
3355 `FREE_VARIABLES' doesn't try to free them. */
3356 regstart = regend = old_regstart = old_regend = best_regstart
3357 = best_regend = reg_dummy = NULL;
3358 reg_info = reg_info_dummy = (register_info_type *) NULL;
3359 }
3360#endif /* REGEX_MALLOC */
3361
3362 /* The starting position is bogus. */
3363 if (pos < 0 || pos > size1 + size2)
3364 {
3365 FREE_VARIABLES ();
3366 return -1;
3367 }
3368
3369 /* Initialize subexpression text positions to -1 to mark ones that no
3370 start_memory/stop_memory has been seen for. Also initialize the
3371 register information struct. */
3372 for (mcnt = 1; mcnt < num_regs; mcnt++)
3373 {
3374 regstart[mcnt] = regend[mcnt]
3375 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
3376
3377 REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
3378 IS_ACTIVE (reg_info[mcnt]) = 0;
3379 MATCHED_SOMETHING (reg_info[mcnt]) = 0;
3380 EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
3381 }
3382
3383 /* We move `string1' into `string2' if the latter's empty -- but not if
3384 `string1' is null. */
3385 if (size2 == 0 && string1 != NULL)
3386 {
3387 string2 = string1;
3388 size2 = size1;
3389 string1 = 0;
3390 size1 = 0;
3391 }
3392 end1 = string1 + size1;
3393 end2 = string2 + size2;
3394
3395 /* Compute where to stop matching, within the two strings. */
3396 if (stop <= size1)
3397 {
3398 end_match_1 = string1 + stop;
3399 end_match_2 = string2;
3400 }
3401 else
3402 {
3403 end_match_1 = end1;
3404 end_match_2 = string2 + stop - size1;
3405 }
3406
3407 /* `p' scans through the pattern as `d' scans through the data.
3408 `dend' is the end of the input string that `d' points within. `d'
3409 is advanced into the following input string whenever necessary, but
3410 this happens before fetching; therefore, at the beginning of the
3411 loop, `d' can be pointing at the end of a string, but it cannot
3412 equal `string2'. */
3413 if (size1 > 0 && pos <= size1)
3414 {
3415 d = string1 + pos;
3416 dend = end_match_1;
3417 }
3418 else
3419 {
3420 d = string2 + pos - size1;
3421 dend = end_match_2;
3422 }
3423
3424 DEBUG_PRINT1 ("The compiled pattern is: ");
3425 DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend);
3426 DEBUG_PRINT1 ("The string to match is: `");
3427 DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2);
3428 DEBUG_PRINT1 ("'\n");
3429
3430 /* This loops over pattern commands. It exits by returning from the
3431 function if the match is complete, or it drops through if the match
3432 fails at this starting point in the input data. */
3433 for (;;)
3434 {
3435 DEBUG_PRINT2 ("\n0x%x: ", p);
3436
3437 if (p == pend)
3438 { /* End of pattern means we might have succeeded. */
3439 DEBUG_PRINT1 ("end of pattern ... ");
3440
3441 /* If we haven't matched the entire string, and we want the
3442 longest match, try backtracking. */
3443 if (d != end_match_2)
3444 {
3445 DEBUG_PRINT1 ("backtracking.\n");
3446
3447 if (!FAIL_STACK_EMPTY ())
3448 { /* More failure points to try. */
3449 boolean same_str_p = (FIRST_STRING_P (match_end)
3450 == MATCHING_IN_FIRST_STRING);
3451
3452 /* If exceeds best match so far, save it. */
3453 if (!best_regs_set
3454 || (same_str_p && d > match_end)
3455 || (!same_str_p && !MATCHING_IN_FIRST_STRING))
3456 {
3457 best_regs_set = true;
3458 match_end = d;
3459
3460 DEBUG_PRINT1 ("\nSAVING match as best so far.\n");
3461
3462 for (mcnt = 1; mcnt < num_regs; mcnt++)
3463 {
3464 best_regstart[mcnt] = regstart[mcnt];
3465 best_regend[mcnt] = regend[mcnt];
3466 }
3467 }
3468 goto fail;
3469 }
3470
3471 /* If no failure points, don't restore garbage. */
3472 else if (best_regs_set)
3473 {
3474 restore_best_regs:
3475 /* Restore best match. It may happen that `dend ==
3476 end_match_1' while the restored d is in string2.
3477 For example, the pattern `x.*y.*z' against the
3478 strings `x-' and `y-z-', if the two strings are
3479 not consecutive in memory. */
3480 DEBUG_PRINT1 ("Restoring best registers.\n");
3481
3482 d = match_end;
3483 dend = ((d >= string1 && d <= end1)
3484 ? end_match_1 : end_match_2);
3485
3486 for (mcnt = 1; mcnt < num_regs; mcnt++)
3487 {
3488 regstart[mcnt] = best_regstart[mcnt];
3489 regend[mcnt] = best_regend[mcnt];
3490 }
3491 }
3492 } /* d != end_match_2 */
3493
3494 DEBUG_PRINT1 ("Accepting match.\n");
3495
3496 /* If caller wants register contents data back, do it. */
3497 if (regs && !bufp->no_sub)
3498 {
3499 /* Have the register data arrays been allocated? */
3500 if (bufp->regs_allocated == REGS_UNALLOCATED)
3501 { /* No. So allocate them with malloc. We need one
3502 extra element beyond `num_regs' for the `-1' marker
3503 GNU code uses. */
3504 regs->num_regs = MAX (RE_NREGS, num_regs + 1);
3505 regs->start = TALLOC (regs->num_regs, regoff_t);
3506 regs->end = TALLOC (regs->num_regs, regoff_t);
3507 if (regs->start == NULL || regs->end == NULL)
3508 return -2;
3509 bufp->regs_allocated = REGS_REALLOCATE;
3510 }
3511 else if (bufp->regs_allocated == REGS_REALLOCATE)
3512 { /* Yes. If we need more elements than were already
3513 allocated, reallocate them. If we need fewer, just
3514 leave it alone. */
3515 if (regs->num_regs < num_regs + 1)
3516 {
3517 regs->num_regs = num_regs + 1;
3518 RETALLOC (regs->start, regs->num_regs, regoff_t);
3519 RETALLOC (regs->end, regs->num_regs, regoff_t);
3520 if (regs->start == NULL || regs->end == NULL)
3521 return -2;
3522 }
3523 }
3524 else
3525 assert (bufp->regs_allocated == REGS_FIXED);
3526
3527 /* Convert the pointer data in `regstart' and `regend' to
3528 indices. Register zero has to be set differently,
3529 since we haven't kept track of any info for it. */
3530 if (regs->num_regs > 0)
3531 {
3532 regs->start[0] = pos;
3533 regs->end[0] = (MATCHING_IN_FIRST_STRING ? d - string1
3534 : d - string2 + size1);
3535 }
3536
3537 /* Go through the first `min (num_regs, regs->num_regs)'
3538 registers, since that is all we initialized. */
3539 for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++)
3540 {
3541 if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt]))
3542 regs->start[mcnt] = regs->end[mcnt] = -1;
3543 else
3544 {
3545 regs->start[mcnt] = POINTER_TO_OFFSET (regstart[mcnt]);
3546 regs->end[mcnt] = POINTER_TO_OFFSET (regend[mcnt]);
3547 }
3548 }
3549
3550 /* If the regs structure we return has more elements than
3551 were in the pattern, set the extra elements to -1. If
3552 we (re)allocated the registers, this is the case,
3553 because we always allocate enough to have at least one
3554 -1 at the end. */
3555 for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++)
3556 regs->start[mcnt] = regs->end[mcnt] = -1;
3557 } /* regs && !bufp->no_sub */
3558
3559 FREE_VARIABLES ();
3560 DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n",
3561 nfailure_points_pushed, nfailure_points_popped,
3562 nfailure_points_pushed - nfailure_points_popped);
3563 DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed);
3564
3565 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
3566 ? string1
3567 : string2 - size1);
3568
3569 DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt);
3570
3571 return mcnt;
3572 }
3573
3574 /* Otherwise match next pattern command. */
3575#ifdef SWITCH_ENUM_BUG
3576 switch ((int) ((re_opcode_t) *p++))
3577#else
3578 switch ((re_opcode_t) *p++)
3579#endif
3580 {
3581 /* Ignore these. Used to ignore the n of succeed_n's which
3582 currently have n == 0. */
3583 case no_op:
3584 DEBUG_PRINT1 ("EXECUTING no_op.\n");
3585 break;
3586
3587
3588 /* Match the next n pattern characters exactly. The following
3589 byte in the pattern defines n, and the n bytes after that
3590 are the characters to match. */
3591 case exactn:
3592 mcnt = *p++;
3593 DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt);
3594
3595 /* This is written out as an if-else so we don't waste time
3596 testing `translate' inside the loop. */
3597 if (translate)
3598 {
3599 do
3600 {
3601 PREFETCH ();
3602 if (translate[(unsigned char) *d++] != (char) *p++)
3603 goto fail;
3604 }
3605 while (--mcnt);
3606 }
3607 else
3608 {
3609 do
3610 {
3611 PREFETCH ();
3612 if (*d++ != (char) *p++) goto fail;
3613 }
3614 while (--mcnt);
3615 }
3616 SET_REGS_MATCHED ();
3617 break;
3618
3619
3620 /* Match any character except possibly a newline or a null. */
3621 case anychar:
3622 DEBUG_PRINT1 ("EXECUTING anychar.\n");
3623
3624 PREFETCH ();
3625
3626 if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n')
3627 || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000'))
3628 goto fail;
3629
3630 SET_REGS_MATCHED ();
3631 DEBUG_PRINT2 (" Matched `%d'.\n", *d);
3632 d++;
3633 break;
3634
3635
3636 case charset:
3637 case charset_not:
3638 {
3639 register unsigned char c;
3640 boolean not = (re_opcode_t) *(p - 1) == charset_not;
3641
3642 DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : "");
3643
3644 PREFETCH ();
3645 c = TRANSLATE (*d); /* The character to match. */
3646
3647 /* Cast to `unsigned' instead of `unsigned char' in case the
3648 bit list is a full 32 bytes long. */
3649 if (c < (unsigned) (*p * BYTEWIDTH)
3650 && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
3651 not = !not;
3652
3653 p += 1 + *p;
3654
3655 if (!not) goto fail;
3656
3657 SET_REGS_MATCHED ();
3658 d++;
3659 break;
3660 }
3661
3662
3663 /* The beginning of a group is represented by start_memory.
3664 The arguments are the register number in the next byte, and the
3665 number of groups inner to this one in the next. The text
3666 matched within the group is recorded (in the internal
3667 registers data structure) under the register number. */
3668 case start_memory:
3669 DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]);
3670
3671 /* Find out if this group can match the empty string. */
3672 p1 = p; /* To send to group_match_null_string_p. */
3673
3674 if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
3675 REG_MATCH_NULL_STRING_P (reg_info[*p])
3676 = group_match_null_string_p (&p1, pend, reg_info);
3677
3678 /* Save the position in the string where we were the last time
3679 we were at this open-group operator in case the group is
3680 operated upon by a repetition operator, e.g., with `(a*)*b'
3681 against `ab'; then we want to ignore where we are now in
3682 the string in case this attempt to match fails. */
3683 old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
3684 ? REG_UNSET (regstart[*p]) ? d : regstart[*p]
3685 : regstart[*p];
3686 DEBUG_PRINT2 (" old_regstart: %d\n",
3687 POINTER_TO_OFFSET (old_regstart[*p]));
3688
3689 regstart[*p] = d;
3690 DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p]));
3691
3692 IS_ACTIVE (reg_info[*p]) = 1;
3693 MATCHED_SOMETHING (reg_info[*p]) = 0;
3694
3695 /* This is the new highest active register. */
3696 highest_active_reg = *p;
3697
3698 /* If nothing was active before, this is the new lowest active
3699 register. */
3700 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
3701 lowest_active_reg = *p;
3702
3703 /* Move past the register number and inner group count. */
3704 p += 2;
3705 break;
3706
3707
3708 /* The stop_memory opcode represents the end of a group. Its
3709 arguments are the same as start_memory's: the register
3710 number, and the number of inner groups. */
3711 case stop_memory:
3712 DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]);
3713
3714 /* We need to save the string position the last time we were at
3715 this close-group operator in case the group is operated
3716 upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
3717 against `aba'; then we want to ignore where we are now in
3718 the string in case this attempt to match fails. */
3719 old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
3720 ? REG_UNSET (regend[*p]) ? d : regend[*p]
3721 : regend[*p];
3722 DEBUG_PRINT2 (" old_regend: %d\n",
3723 POINTER_TO_OFFSET (old_regend[*p]));
3724
3725 regend[*p] = d;
3726 DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p]));
3727
3728 /* This register isn't active anymore. */
3729 IS_ACTIVE (reg_info[*p]) = 0;
3730
3731 /* If this was the only register active, nothing is active
3732 anymore. */
3733 if (lowest_active_reg == highest_active_reg)
3734 {
3735 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
3736 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
3737 }
3738 else
3739 { /* We must scan for the new highest active register, since
3740 it isn't necessarily one less than now: consider
3741 (a(b)c(d(e)f)g). When group 3 ends, after the f), the
3742 new highest active register is 1. */
3743 unsigned char r = *p - 1;
3744 while (r > 0 && !IS_ACTIVE (reg_info[r]))
3745 r--;
3746
3747 /* If we end up at register zero, that means that we saved
3748 the registers as the result of an `on_failure_jump', not
3749 a `start_memory', and we jumped to past the innermost
3750 `stop_memory'. For example, in ((.)*) we save
3751 registers 1 and 2 as a result of the *, but when we pop
3752 back to the second ), we are at the stop_memory 1.
3753 Thus, nothing is active. */
3754 if (r == 0)
3755 {
3756 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
3757 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
3758 }
3759 else
3760 highest_active_reg = r;
3761 }
3762
3763 /* If just failed to match something this time around with a
3764 group that's operated on by a repetition operator, try to
3765 force exit from the ``loop'', and restore the register
3766 information for this group that we had before trying this
3767 last match. */
3768 if ((!MATCHED_SOMETHING (reg_info[*p])
3769 || (re_opcode_t) p[-3] == start_memory)
3770 && (p + 2) < pend)
3771 {
3772 boolean is_a_jump_n = false;
3773
3774 p1 = p + 2;
3775 mcnt = 0;
3776 switch ((re_opcode_t) *p1++)
3777 {
3778 case jump_n:
3779 is_a_jump_n = true;
3780 case pop_failure_jump:
3781 case maybe_pop_jump:
3782 case jump:
3783 case dummy_failure_jump:
3784 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
3785 if (is_a_jump_n)
3786 p1 += 2;
3787 break;
3788
3789 default:
3790 /* do nothing */ ;
3791 }
3792 p1 += mcnt;
3793
3794 /* If the next operation is a jump backwards in the pattern
3795 to an on_failure_jump right before the start_memory
3796 corresponding to this stop_memory, exit from the loop
3797 by forcing a failure after pushing on the stack the
3798 on_failure_jump's jump in the pattern, and d. */
3799 if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
3800 && (re_opcode_t) p1[3] == start_memory && p1[4] == *p)
3801 {
3802 /* If this group ever matched anything, then restore
3803 what its registers were before trying this last
3804 failed match, e.g., with `(a*)*b' against `ab' for
3805 regstart[1], and, e.g., with `((a*)*(b*)*)*'
3806 against `aba' for regend[3].
3807
3808 Also restore the registers for inner groups for,
3809 e.g., `((a*)(b*))*' against `aba' (register 3 would
3810 otherwise get trashed). */
3811
3812 if (EVER_MATCHED_SOMETHING (reg_info[*p]))
3813 {
3814 unsigned r;
3815
3816 EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
3817
3818 /* Restore this and inner groups' (if any) registers. */
3819 for (r = *p; r < *p + *(p + 1); r++)
3820 {
3821 regstart[r] = old_regstart[r];
3822
3823 /* xx why this test? */
3824 if ((int) old_regend[r] >= (int) regstart[r])
3825 regend[r] = old_regend[r];
3826 }
3827 }
3828 p1++;
3829 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
3830 /*
3831 * WARN 191: illegal pointer conversion
3832 * between CONST and NON-CONST attribute
3833 */
3834# ifdef __TANDEM
3835# pragma NOWARN( 191 )
3836# endif
3837 PUSH_FAILURE_POINT (p1 + mcnt, d, -2);
3838# ifdef __TANDEM
3839# pragma WARN( 191 )
3840# endif
3841
3842 goto fail;
3843 }
3844 }
3845
3846 /* Move past the register number and the inner group count. */
3847 p += 2;
3848 break;
3849
3850
3851 /* \<digit> has been turned into a `duplicate' command which is
3852 followed by the numeric value of <digit> as the register number. */
3853 case duplicate:
3854 {
3855 register const char *d2, *dend2;
3856 int regno = *p++; /* Get which register to match against. */
3857 DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno);
3858
3859 /* Can't back reference a group which we've never matched. */
3860 if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno]))
3861 goto fail;
3862
3863 /* Where in input to try to start matching. */
3864 d2 = regstart[regno];
3865
3866 /* Where to stop matching; if both the place to start and
3867 the place to stop matching are in the same string, then
3868 set to the place to stop, otherwise, for now have to use
3869 the end of the first string. */
3870
3871 dend2 = ((FIRST_STRING_P (regstart[regno])
3872 == FIRST_STRING_P (regend[regno]))
3873 ? regend[regno] : end_match_1);
3874 for (;;)
3875 {
3876 /* If necessary, advance to next segment in register
3877 contents. */
3878 while (d2 == dend2)
3879 {
3880 if (dend2 == end_match_2) break;
3881 if (dend2 == regend[regno]) break;
3882
3883 /* End of string1 => advance to string2. */
3884 d2 = string2;
3885 dend2 = regend[regno];
3886 }
3887 /* At end of register contents => success */
3888 if (d2 == dend2) break;
3889
3890 /* If necessary, advance to next segment in data. */
3891 PREFETCH ();
3892
3893 /* How many characters left in this segment to match. */
3894 mcnt = dend - d;
3895
3896 /* Want how many consecutive characters we can match in
3897 one shot, so, if necessary, adjust the count. */
3898 if (mcnt > dend2 - d2)
3899 mcnt = dend2 - d2;
3900
3901 /* Compare that many; failure if mismatch, else move
3902 past them. */
3903 if (translate
3904 ? bcmp_translate (d, d2, mcnt, translate)
3905 : bcmp (d, d2, mcnt))
3906 goto fail;
3907 d += mcnt, d2 += mcnt;
3908 }
3909 }
3910 break;
3911
3912
3913 /* begline matches the empty string at the beginning of the string
3914 (unless `not_bol' is set in `bufp'), and, if
3915 `newline_anchor' is set, after newlines. */
3916 case begline:
3917 DEBUG_PRINT1 ("EXECUTING begline.\n");
3918
3919 if (AT_STRINGS_BEG (d))
3920 {
3921 if (!bufp->not_bol) break;
3922 }
3923 else if (d[-1] == '\n' && bufp->newline_anchor)
3924 {
3925 break;
3926 }
3927 /* In all other cases, we fail. */
3928 goto fail;
3929
3930
3931 /* endline is the dual of begline. */
3932 case endline:
3933 DEBUG_PRINT1 ("EXECUTING endline.\n");
3934
3935 if (AT_STRINGS_END (d))
3936 {
3937 if (!bufp->not_eol) break;
3938 }
3939
3940 /* We have to ``prefetch'' the next character. */
3941 else if ((d == end1 ? *string2 : *d) == '\n'
3942 && bufp->newline_anchor)
3943 {
3944 break;
3945 }
3946 goto fail;
3947
3948
3949 /* Match at the very beginning of the data. */
3950 case begbuf:
3951 DEBUG_PRINT1 ("EXECUTING begbuf.\n");
3952 if (AT_STRINGS_BEG (d))
3953 break;
3954 goto fail;
3955
3956
3957 /* Match at the very end of the data. */
3958 case endbuf:
3959 DEBUG_PRINT1 ("EXECUTING endbuf.\n");
3960 if (AT_STRINGS_END (d))
3961 break;
3962 goto fail;
3963
3964
3965 /* on_failure_keep_string_jump is used to optimize `.*\n'. It
3966 pushes NULL as the value for the string on the stack. Then
3967 `pop_failure_point' will keep the current value for the
3968 string, instead of restoring it. To see why, consider
3969 matching `foo\nbar' against `.*\n'. The .* matches the foo;
3970 then the . fails against the \n. But the next thing we want
3971 to do is match the \n against the \n; if we restored the
3972 string value, we would be back at the foo.
3973
3974 Because this is used only in specific cases, we don't need to
3975 check all the things that `on_failure_jump' does, to make
3976 sure the right things get saved on the stack. Hence we don't
3977 share its code. The only reason to push anything on the
3978 stack at all is that otherwise we would have to change
3979 `anychar's code to do something besides goto fail in this
3980 case; that seems worse than this. */
3981 case on_failure_keep_string_jump:
3982 DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump");
3983
3984 EXTRACT_NUMBER_AND_INCR (mcnt, p);
3985 DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt);
3986
3987 /*
3988 * WARN 191: illegal pointer conversion
3989 * between CONST and NON-CONST attribute
3990 */
3991# ifdef __TANDEM
3992# pragma NOWARN( 191 )
3993# endif
3994 PUSH_FAILURE_POINT (p + mcnt, NULL, -2);
3995# ifdef __TANDEM
3996# pragma WARN( 191 )
3997# endif
3998 break;
3999
4000
4001 /* Uses of on_failure_jump:
4002
4003 Each alternative starts with an on_failure_jump that points
4004 to the beginning of the next alternative. Each alternative
4005 except the last ends with a jump that in effect jumps past
4006 the rest of the alternatives. (They really jump to the
4007 ending jump of the following alternative, because tensioning
4008 these jumps is a hassle.)
4009
4010 Repeats start with an on_failure_jump that points past both
4011 the repetition text and either the following jump or
4012 pop_failure_jump back to this on_failure_jump. */
4013 case on_failure_jump:
4014 on_failure:
4015 DEBUG_PRINT1 ("EXECUTING on_failure_jump");
4016
4017 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4018 DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt);
4019
4020 /* If this on_failure_jump comes right before a group (i.e.,
4021 the original * applied to a group), save the information
4022 for that group and all inner ones, so that if we fail back
4023 to this point, the group's information will be correct.
4024 For example, in \(a*\)*\1, we need the preceding group,
4025 and in \(\(a*\)b*\)\2, we need the inner group. */
4026
4027 /* We can't use `p' to check ahead because we push
4028 a failure point to `p + mcnt' after we do this. */
4029 p1 = p;
4030
4031 /* We need to skip no_op's before we look for the
4032 start_memory in case this on_failure_jump is happening as
4033 the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
4034 against aba. */
4035 while (p1 < pend && (re_opcode_t) *p1 == no_op)
4036 p1++;
4037
4038 if (p1 < pend && (re_opcode_t) *p1 == start_memory)
4039 {
4040 /* We have a new highest active register now. This will
4041 get reset at the start_memory we are about to get to,
4042 but we will have saved all the registers relevant to
4043 this repetition op, as described above. */
4044 highest_active_reg = *(p1 + 1) + *(p1 + 2);
4045 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
4046 lowest_active_reg = *(p1 + 1);
4047 }
4048
4049 DEBUG_PRINT1 (":\n");
4050 /*
4051 * WARN 191: illegal pointer conversion
4052 * between CONST and NON-CONST attribute
4053 */
4054# ifdef __TANDEM
4055# pragma NOWARN( 191 )
4056# endif
4057 PUSH_FAILURE_POINT (p + mcnt, d, -2);
4058# ifdef __TANDEM
4059# pragma WARN( 191 )
4060# endif
4061 break;
4062
4063
4064 /* A smart repeat ends with `maybe_pop_jump'.
4065 We change it to either `pop_failure_jump' or `jump'. */
4066 case maybe_pop_jump:
4067 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4068 DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt);
4069 {
4070 register unsigned char *p2 = p;
4071
4072 /* Compare the beginning of the repeat with what in the
4073 pattern follows its end. If we can establish that there
4074 is nothing that they would both match, i.e., that we
4075 would have to backtrack because of (as in, e.g., `a*a')
4076 then we can change to pop_failure_jump, because we'll
4077 never have to backtrack.
4078
4079 This is not true in the case of alternatives: in
4080 `(a|ab)*' we do need to backtrack to the `ab' alternative
4081 (e.g., if the string was `ab'). But instead of trying to
4082 detect that here, the alternative has put on a dummy
4083 failure point which is what we will end up popping. */
4084
4085 /* Skip over open/close-group commands. */
4086 while (p2 + 2 < pend
4087 && ((re_opcode_t) *p2 == stop_memory
4088 || (re_opcode_t) *p2 == start_memory))
4089 p2 += 3; /* Skip over args, too. */
4090
4091 /* If we're at the end of the pattern, we can change. */
4092 if (p2 == pend)
4093 {
4094 /* Consider what happens when matching ":\(.*\)"
4095 against ":/". I don't really understand this code
4096 yet. */
4097 p[-3] = (unsigned char) pop_failure_jump;
4098 DEBUG_PRINT1
4099 (" End of pattern: change to `pop_failure_jump'.\n");
4100 }
4101
4102 else if ((re_opcode_t) *p2 == exactn
4103 || (bufp->newline_anchor && (re_opcode_t) *p2 == endline))
4104 {
4105 register unsigned char c
4106 = *p2 == (unsigned char) endline ? '\n' : p2[2];
4107 p1 = p + mcnt;
4108
4109 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
4110 to the `maybe_finalize_jump' of this case. Examine what
4111 follows. */
4112 if ((re_opcode_t) p1[3] == exactn && p1[5] != c)
4113 {
4114 p[-3] = (unsigned char) pop_failure_jump;
4115 DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n",
4116 c, p1[5]);
4117 }
4118
4119 else if ((re_opcode_t) p1[3] == charset
4120 || (re_opcode_t) p1[3] == charset_not)
4121 {
4122 int not = (re_opcode_t) p1[3] == charset_not;
4123
4124 if (c < (unsigned char) (p1[4] * BYTEWIDTH)
4125 && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
4126 not = !not;
4127
4128 /* `not' is equal to 1 if c would match, which means
4129 that we can't change to pop_failure_jump. */
4130 if (!not)
4131 {
4132 p[-3] = (unsigned char) pop_failure_jump;
4133 DEBUG_PRINT1 (" No match => pop_failure_jump.\n");
4134 }
4135 }
4136 }
4137 }
4138 p -= 2; /* Point at relative address again. */
4139 if ((re_opcode_t) p[-1] != pop_failure_jump)
4140 {
4141 p[-1] = (unsigned char) jump;
4142 DEBUG_PRINT1 (" Match => jump.\n");
4143 goto unconditional_jump;
4144 }
4145 /* Note fall through. */
4146
4147
4148 /* The end of a simple repeat has a pop_failure_jump back to
4149 its matching on_failure_jump, where the latter will push a
4150 failure point. The pop_failure_jump takes off failure
4151 points put on by this pop_failure_jump's matching
4152 on_failure_jump; we got through the pattern to here from the
4153 matching on_failure_jump, so didn't fail. */
4154 case pop_failure_jump:
4155 {
4156 /* We need to pass separate storage for the lowest and
4157 highest registers, even though we don't care about the
4158 actual values. Otherwise, we will restore only one
4159 register from the stack, since lowest will == highest in
4160 `pop_failure_point'. */
4161 unsigned dummy_low_reg, dummy_high_reg;
4162 unsigned char *pdummy;
4163 const char *sdummy;
4164
4165 DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n");
4166 POP_FAILURE_POINT (sdummy, pdummy,
4167 dummy_low_reg, dummy_high_reg,
4168 reg_dummy, reg_dummy, reg_info_dummy);
4169 }
4170 /* Note fall through. */
4171
4172
4173 /* Unconditionally jump (without popping any failure points). */
4174 case jump:
4175 unconditional_jump:
4176 EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */
4177 DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt);
4178 p += mcnt; /* Do the jump. */
4179 DEBUG_PRINT2 ("(to 0x%x).\n", p);
4180 break;
4181
4182
4183 /* We need this opcode so we can detect where alternatives end
4184 in `group_match_null_string_p' et al. */
4185 case jump_past_alt:
4186 DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n");
4187 goto unconditional_jump;
4188
4189
4190 /* Normally, the on_failure_jump pushes a failure point, which
4191 then gets popped at pop_failure_jump. We will end up at
4192 pop_failure_jump, also, and with a pattern of, say, `a+', we
4193 are skipping over the on_failure_jump, so we have to push
4194 something meaningless for pop_failure_jump to pop. */
4195 case dummy_failure_jump:
4196 DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n");
4197 /* It doesn't matter what we push for the string here. What
4198 the code at `fail' tests is the value for the pattern. */
4199 /*
4200 * WARN 191: illegal pointer conversion
4201 * between CONST and NON-CONST attribute
4202 */
4203# ifdef __TANDEM
4204# pragma NOWARN( 191 )
4205# endif
4206 PUSH_FAILURE_POINT (0, 0, -2);
4207# ifdef __TANDEM
4208# pragma WARN( 191 )
4209# endif
4210 goto unconditional_jump;
4211
4212
4213 /* At the end of an alternative, we need to push a dummy failure
4214 point in case we are followed by a `pop_failure_jump', because
4215 we don't want the failure point for the alternative to be
4216 popped. For example, matching `(a|ab)*' against `aab'
4217 requires that we match the `ab' alternative. */
4218 case push_dummy_failure:
4219 DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n");
4220 /* See comments just above at `dummy_failure_jump' about the
4221 two zeroes. */
4222 /*
4223 * WARN 191: illegal pointer conversion
4224 * between CONST and NON-CONST attribute
4225 */
4226# ifdef __TANDEM
4227# pragma NOWARN( 191 )
4228# endif
4229 PUSH_FAILURE_POINT (0, 0, -2);
4230# ifdef __TANDEM
4231# pragma WARN( 191 )
4232# endif
4233 break;
4234
4235 /* Have to succeed matching what follows at least n times.
4236 After that, handle like `on_failure_jump'. */
4237 case succeed_n:
4238 EXTRACT_NUMBER (mcnt, p + 2);
4239 DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt);
4240
4241 assert (mcnt >= 0);
4242 /* Originally, this is how many times we HAVE to succeed. */
4243 if (mcnt > 0)
4244 {
4245 mcnt--;
4246 p += 2;
4247 STORE_NUMBER_AND_INCR (p, mcnt);
4248 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p, mcnt);
4249 }
4250 else if (mcnt == 0)
4251 {
4252 DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2);
4253 p[2] = (unsigned char) no_op;
4254 p[3] = (unsigned char) no_op;
4255 goto on_failure;
4256 }
4257 break;
4258
4259 case jump_n:
4260 EXTRACT_NUMBER (mcnt, p + 2);
4261 DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt);
4262
4263 /* Originally, this is how many times we CAN jump. */
4264 if (mcnt)
4265 {
4266 mcnt--;
4267 STORE_NUMBER (p + 2, mcnt);
4268 goto unconditional_jump;
4269 }
4270 /* If don't have to jump any more, skip over the rest of command. */
4271 else
4272 p += 4;
4273 break;
4274
4275 case set_number_at:
4276 {
4277 DEBUG_PRINT1 ("EXECUTING set_number_at.\n");
4278
4279 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4280 p1 = p + mcnt;
4281 EXTRACT_NUMBER_AND_INCR (mcnt, p);
4282 DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt);
4283 STORE_NUMBER (p1, mcnt);
4284 break;
4285 }
4286
4287 case wordbound:
4288 DEBUG_PRINT1 ("EXECUTING wordbound.\n");
4289 if (AT_WORD_BOUNDARY (d))
4290 break;
4291 goto fail;
4292
4293 case notwordbound:
4294 DEBUG_PRINT1 ("EXECUTING notwordbound.\n");
4295 if (AT_WORD_BOUNDARY (d))
4296 goto fail;
4297 break;
4298
4299 case wordbeg:
4300 DEBUG_PRINT1 ("EXECUTING wordbeg.\n");
4301 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
4302 break;
4303 goto fail;
4304
4305 case wordend:
4306 DEBUG_PRINT1 ("EXECUTING wordend.\n");
4307 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
4308 && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
4309 break;
4310 goto fail;
4311
4312#ifdef emacs
4313#ifdef emacs19
4314 case before_dot:
4315 DEBUG_PRINT1 ("EXECUTING before_dot.\n");
4316 if (PTR_CHAR_POS ((unsigned char *) d) >= point)
4317 goto fail;
4318 break;
4319
4320 case at_dot:
4321 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
4322 if (PTR_CHAR_POS ((unsigned char *) d) != point)
4323 goto fail;
4324 break;
4325
4326 case after_dot:
4327 DEBUG_PRINT1 ("EXECUTING after_dot.\n");
4328 if (PTR_CHAR_POS ((unsigned char *) d) <= point)
4329 goto fail;
4330 break;
4331#else /* not emacs19 */
4332 case at_dot:
4333 DEBUG_PRINT1 ("EXECUTING at_dot.\n");
4334 if (PTR_CHAR_POS ((unsigned char *) d) + 1 != point)
4335 goto fail;
4336 break;
4337#endif /* not emacs19 */
4338
4339 case syntaxspec:
4340 DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt);
4341 mcnt = *p++;
4342 goto matchsyntax;
4343
4344 case wordchar:
4345 DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n");
4346 mcnt = (int) Sword;
4347 matchsyntax:
4348 PREFETCH ();
4349 if (SYNTAX (*d++) != (enum syntaxcode) mcnt)
4350 goto fail;
4351 SET_REGS_MATCHED ();
4352 break;
4353
4354 case notsyntaxspec:
4355 DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt);
4356 mcnt = *p++;
4357 goto matchnotsyntax;
4358
4359 case notwordchar:
4360 DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n");
4361 mcnt = (int) Sword;
4362 matchnotsyntax:
4363 PREFETCH ();
4364 if (SYNTAX (*d++) == (enum syntaxcode) mcnt)
4365 goto fail;
4366 SET_REGS_MATCHED ();
4367 break;
4368
4369#else /* not emacs */
4370 case wordchar:
4371 DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n");
4372 PREFETCH ();
4373 if (!WORDCHAR_P (d))
4374 goto fail;
4375 SET_REGS_MATCHED ();
4376 d++;
4377 break;
4378
4379 case notwordchar:
4380 DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n");
4381 PREFETCH ();
4382 if (WORDCHAR_P (d))
4383 goto fail;
4384 SET_REGS_MATCHED ();
4385 d++;
4386 break;
4387#endif /* not emacs */
4388
4389 default:
4390 abort ();
4391 }
4392 continue; /* Successfully executed one pattern command; keep going. */
4393
4394
4395 /* We goto here if a matching operation fails. */
4396 fail:
4397 if (!FAIL_STACK_EMPTY ())
4398 { /* A restart point is known. Restore to that state. */
4399 DEBUG_PRINT1 ("\nFAIL:\n");
4400 POP_FAILURE_POINT (d, p,
4401 lowest_active_reg, highest_active_reg,
4402 regstart, regend, reg_info);
4403
4404 /* If this failure point is a dummy, try the next one. */
4405 if (!p)
4406 goto fail;
4407
4408 /* If we failed to the end of the pattern, don't examine *p. */
4409 assert (p <= pend);
4410 if (p < pend)
4411 {
4412 boolean is_a_jump_n = false;
4413
4414 /* If failed to a backwards jump that's part of a repetition
4415 loop, need to pop this failure point and use the next one. */
4416 switch ((re_opcode_t) *p)
4417 {
4418 case jump_n:
4419 is_a_jump_n = true;
4420 case maybe_pop_jump:
4421 case pop_failure_jump:
4422 case jump:
4423 p1 = p + 1;
4424 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4425 p1 += mcnt;
4426
4427 if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
4428 || (!is_a_jump_n
4429 && (re_opcode_t) *p1 == on_failure_jump))
4430 goto fail;
4431 break;
4432 default:
4433 /* do nothing */ ;
4434 }
4435 }
4436
4437 if (d >= string1 && d <= end1)
4438 dend = end_match_1;
4439 }
4440 else
4441 break; /* Matching at this starting point really fails. */
4442 } /* for (;;) */
4443
4444 if (best_regs_set)
4445 goto restore_best_regs;
4446
4447 FREE_VARIABLES ();
4448
4449 return -1; /* Failure to match. */
4450} /* re_match_2 */
4451\f
4452/* Subroutine definitions for re_match_2. */
4453
4454
4455/* We are passed P pointing to a register number after a start_memory.
4456
4457 Return true if the pattern up to the corresponding stop_memory can
4458 match the empty string, and false otherwise.
4459
4460 If we find the matching stop_memory, sets P to point to one past its number.
4461 Otherwise, sets P to an undefined byte less than or equal to END.
4462
4463 We don't handle duplicates properly (yet). */
4464
4465static boolean
4466group_match_null_string_p (p, end, reg_info)
4467 unsigned char **p, *end;
4468 register_info_type *reg_info;
4469{
4470 int mcnt;
4471 /* Point to after the args to the start_memory. */
4472 unsigned char *p1 = *p + 2;
4473
4474 while (p1 < end)
4475 {
4476 /* Skip over opcodes that can match nothing, and return true or
4477 false, as appropriate, when we get to one that can't, or to the
4478 matching stop_memory. */
4479
4480 switch ((re_opcode_t) *p1)
4481 {
4482 /* Could be either a loop or a series of alternatives. */
4483 case on_failure_jump:
4484 p1++;
4485 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4486
4487 /* If the next operation is not a jump backwards in the
4488 pattern. */
4489
4490 if (mcnt >= 0)
4491 {
4492 /* Go through the on_failure_jumps of the alternatives,
4493 seeing if any of the alternatives cannot match nothing.
4494 The last alternative starts with only a jump,
4495 whereas the rest start with on_failure_jump and end
4496 with a jump, e.g., here is the pattern for `a|b|c':
4497
4498 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
4499 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
4500 /exactn/1/c
4501
4502 So, we have to first go through the first (n-1)
4503 alternatives and then deal with the last one separately. */
4504
4505
4506 /* Deal with the first (n-1) alternatives, which start
4507 with an on_failure_jump (see above) that jumps to right
4508 past a jump_past_alt. */
4509
4510 while ((re_opcode_t) p1[mcnt-3] == jump_past_alt)
4511 {
4512 /* `mcnt' holds how many bytes long the alternative
4513 is, including the ending `jump_past_alt' and
4514 its number. */
4515
4516 if (!alt_match_null_string_p (p1, p1 + mcnt - 3,
4517 reg_info))
4518 return false;
4519
4520 /* Move to right after this alternative, including the
4521 jump_past_alt. */
4522 p1 += mcnt;
4523
4524 /* Break if it's the beginning of an n-th alternative
4525 that doesn't begin with an on_failure_jump. */
4526 if ((re_opcode_t) *p1 != on_failure_jump)
4527 break;
4528
4529 /* Still have to check that it's not an n-th
4530 alternative that starts with an on_failure_jump. */
4531 p1++;
4532 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4533 if ((re_opcode_t) p1[mcnt-3] != jump_past_alt)
4534 {
4535 /* Get to the beginning of the n-th alternative. */
4536 p1 -= 3;
4537 break;
4538 }
4539 }
4540
4541 /* Deal with the last alternative: go back and get number
4542 of the `jump_past_alt' just before it. `mcnt' contains
4543 the length of the alternative. */
4544 EXTRACT_NUMBER (mcnt, p1 - 2);
4545
4546 if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
4547 return false;
4548
4549 p1 += mcnt; /* Get past the n-th alternative. */
4550 } /* if mcnt > 0 */
4551 break;
4552
4553
4554 case stop_memory:
4555 assert (p1[1] == **p);
4556 *p = p1 + 2;
4557 return true;
4558
4559
4560 default:
4561 if (!common_op_match_null_string_p (&p1, end, reg_info))
4562 return false;
4563 }
4564 } /* while p1 < end */
4565
4566 return false;
4567} /* group_match_null_string_p */
4568
4569
4570/* Similar to group_match_null_string_p, but doesn't deal with alternatives:
4571 It expects P to be the first byte of a single alternative and END one
4572 byte past the last. The alternative can contain groups. */
4573
4574static boolean
4575alt_match_null_string_p (p, end, reg_info)
4576 unsigned char *p, *end;
4577 register_info_type *reg_info;
4578{
4579 int mcnt;
4580 unsigned char *p1 = p;
4581
4582 while (p1 < end)
4583 {
4584 /* Skip over opcodes that can match nothing, and break when we get
4585 to one that can't. */
4586
4587 switch ((re_opcode_t) *p1)
4588 {
4589 /* It's a loop. */
4590 case on_failure_jump:
4591 p1++;
4592 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4593 p1 += mcnt;
4594 break;
4595
4596 default:
4597 if (!common_op_match_null_string_p (&p1, end, reg_info))
4598 return false;
4599 }
4600 } /* while p1 < end */
4601
4602 return true;
4603} /* alt_match_null_string_p */
4604
4605
4606/* Deals with the ops common to group_match_null_string_p and
4607 alt_match_null_string_p.
4608
4609 Sets P to one after the op and its arguments, if any. */
4610
4611static boolean
4612common_op_match_null_string_p (p, end, reg_info)
4613 unsigned char **p, *end;
4614 register_info_type *reg_info;
4615{
4616 int mcnt;
4617 boolean ret;
4618 int reg_no;
4619 unsigned char *p1 = *p;
4620
4621 switch ((re_opcode_t) *p1++)
4622 {
4623 case no_op:
4624 case begline:
4625 case endline:
4626 case begbuf:
4627 case endbuf:
4628 case wordbeg:
4629 case wordend:
4630 case wordbound:
4631 case notwordbound:
4632#ifdef emacs
4633 case before_dot:
4634 case at_dot:
4635 case after_dot:
4636#endif
4637 break;
4638
4639 case start_memory:
4640 reg_no = *p1;
4641 assert (reg_no > 0 && reg_no <= MAX_REGNUM);
4642 ret = group_match_null_string_p (&p1, end, reg_info);
4643
4644 /* Have to set this here in case we're checking a group which
4645 contains a group and a back reference to it. */
4646
4647 if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
4648 REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret;
4649
4650 if (!ret)
4651 return false;
4652 break;
4653
4654 /* If this is an optimized succeed_n for zero times, make the jump. */
4655 case jump:
4656 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4657 if (mcnt >= 0)
4658 p1 += mcnt;
4659 else
4660 return false;
4661 break;
4662
4663 case succeed_n:
4664 /* Get to the number of times to succeed. */
4665 p1 += 2;
4666 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4667
4668 if (mcnt == 0)
4669 {
4670 p1 -= 4;
4671 EXTRACT_NUMBER_AND_INCR (mcnt, p1);
4672 p1 += mcnt;
4673 }
4674 else
4675 return false;
4676 break;
4677
4678 case duplicate:
4679 if (!REG_MATCH_NULL_STRING_P (reg_info[*p1]))
4680 return false;
4681 break;
4682
4683 case set_number_at:
4684 p1 += 4;
4685
4686 default:
4687 /* All other opcodes mean we cannot match the empty string. */
4688 return false;
4689 }
4690
4691 *p = p1;
4692 return true;
4693} /* common_op_match_null_string_p */
4694
4695
4696/* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
4697 bytes; nonzero otherwise. */
4698
4699static int
4700bcmp_translate (s1, s2, len, translate)
4701 unsigned char *s1, *s2;
4702 register int len;
4703 char *translate;
4704{
4705 register unsigned char *p1 = s1, *p2 = s2;
4706 while (len)
4707 {
4708 if (translate[*p1++] != translate[*p2++]) return 1;
4709 len--;
4710 }
4711 return 0;
4712}
4713\f
4714/* Entry points for GNU code. */
4715
4716/* re_compile_pattern is the GNU regular expression compiler: it
4717 compiles PATTERN (of length SIZE) and puts the result in BUFP.
4718 Returns 0 if the pattern was valid, otherwise an error string.
4719
4720 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
4721 are set in BUFP on entry.
4722
4723 We call regex_compile to do the actual compilation. */
4724
4725const char *
4726re_compile_pattern (pattern, length, bufp)
4727 const char *pattern;
4728 int length;
4729 struct re_pattern_buffer *bufp;
4730{
4731 reg_errcode_t ret;
4732
4733 /* GNU code is written to assume at least RE_NREGS registers will be set
4734 (and at least one extra will be -1). */
4735 bufp->regs_allocated = REGS_UNALLOCATED;
4736
4737 /* And GNU code determines whether or not to get register information
4738 by passing null for the REGS argument to re_match, etc., not by
4739 setting no_sub. */
4740 bufp->no_sub = 0;
4741
4742 /* Match anchors at newline. */
4743 bufp->newline_anchor = 1;
4744
4745 ret = regex_compile (pattern, length, re_syntax_options, bufp);
4746
4747 return (ret == 0) ? (const char*)NULL : re_error_msg[(int) ret];
4748}
4749\f
4750/* Entry points compatible with 4.2 BSD regex library. We don't define
4751 them if this is an Emacs or POSIX compilation. */
4752
4753#if !defined (emacs) && !defined (_POSIX_SOURCE)
4754
4755/* BSD has one and only one pattern buffer. */
4756static struct re_pattern_buffer re_comp_buf;
4757
4758char *
4759re_comp (s)
4760 const char *s;
4761{
4762 reg_errcode_t ret;
4763
4764 if (!s)
4765 {
4766 if (!re_comp_buf.buffer)
4767 return "No previous regular expression";
4768 return 0;
4769 }
4770
4771 if (!re_comp_buf.buffer)
4772 {
4773 re_comp_buf.buffer = (unsigned char *) malloc (200);
4774 if (re_comp_buf.buffer == NULL)
4775 return "Memory exhausted";
4776 re_comp_buf.allocated = 200;
4777
4778 re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH);
4779 if (re_comp_buf.fastmap == NULL)
4780 return "Memory exhausted";
4781 }
4782
4783 /* Since `re_exec' always passes NULL for the `regs' argument, we
4784 don't need to initialize the pattern buffer fields which affect it. */
4785
4786 /* Match anchors at newlines. */
4787 re_comp_buf.newline_anchor = 1;
4788
4789 ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf);
4790
4791 /* Yes, we're discarding `const' here. */
4792 return (char *) re_error_msg[(int) ret];
4793}
4794
4795
4796int
4797re_exec (s)
4798 const char *s;
4799{
4800 const int len = strlen (s);
4801 return
4802 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0);
4803}
4804#endif /* not emacs and not _POSIX_SOURCE */
4805\f
4806/* POSIX.2 functions. Don't define these for Emacs. */
4807
4808#ifndef emacs
4809
4810/* regcomp takes a regular expression as a string and compiles it.
4811
4812 PREG is a regex_t *. We do not expect any fields to be initialized,
4813 since POSIX says we shouldn't. Thus, we set
4814
4815 `buffer' to the compiled pattern;
4816 `used' to the length of the compiled pattern;
4817 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
4818 REG_EXTENDED bit in CFLAGS is set; otherwise, to
4819 RE_SYNTAX_POSIX_BASIC;
4820 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
4821 `fastmap' and `fastmap_accurate' to zero;
4822 `re_nsub' to the number of subexpressions in PATTERN.
4823
4824 PATTERN is the address of the pattern string.
4825
4826 CFLAGS is a series of bits which affect compilation.
4827
4828 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
4829 use POSIX basic syntax.
4830
4831 If REG_NEWLINE is set, then . and [^...] don't match newline.
4832 Also, regexec will try a match beginning after every newline.
4833
4834 If REG_ICASE is set, then we considers upper- and lowercase
4835 versions of letters to be equivalent when matching.
4836
4837 If REG_NOSUB is set, then when PREG is passed to regexec, that
4838 routine will report only success or failure, and nothing about the
4839 registers.
4840
4841 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
4842 the return codes and their meanings.) */
4843
4844int
4845regcomp (preg, pattern, cflags)
4846 regex_t *preg;
4847 const char *pattern;
4848 int cflags;
4849{
4850 reg_errcode_t ret;
4851 unsigned syntax
4852 = (cflags & REG_EXTENDED) ?
4853 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
4854
4855 /* regex_compile will allocate the space for the compiled pattern. */
4856 preg->buffer = 0;
4857 preg->allocated = 0;
4858
4859 /* Don't bother to use a fastmap when searching. This simplifies the
4860 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
4861 characters after newlines into the fastmap. This way, we just try
4862 every character. */
4863 preg->fastmap = 0;
4864
4865 if (cflags & REG_ICASE)
4866 {
4867 unsigned i;
4868
4869 preg->translate = (char *) malloc (CHAR_SET_SIZE);
4870 if (preg->translate == NULL)
4871 return (int) REG_ESPACE;
4872
4873 /* Map uppercase characters to corresponding lowercase ones. */
4874 for (i = 0; i < CHAR_SET_SIZE; i++)
4875 preg->translate[i] = (unsigned char)(ISUPPER (i) ? tolower (i) : i);
4876 }
4877 else
4878 preg->translate = NULL;
4879
4880 /* If REG_NEWLINE is set, newlines are treated differently. */
4881 if (cflags & REG_NEWLINE)
4882 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
4883 syntax &= ~RE_DOT_NEWLINE;
4884 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
4885 /* It also changes the matching behavior. */
4886 preg->newline_anchor = 1;
4887 }
4888 else
4889 preg->newline_anchor = 0;
4890
4891 preg->no_sub = !!(cflags & REG_NOSUB);
4892
4893 /* POSIX says a null character in the pattern terminates it, so we
4894 can use strlen here in compiling the pattern. */
4895 ret = regex_compile (pattern, strlen (pattern), syntax, preg);
4896
4897 /* POSIX doesn't distinguish between an unmatched open-group and an
4898 unmatched close-group: both are REG_EPAREN. */
4899 if (ret == REG_ERPAREN) ret = REG_EPAREN;
4900
4901 return (int) ret;
4902}
4903
4904
4905/* regexec searches for a given pattern, specified by PREG, in the
4906 string STRING.
4907
4908 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
4909 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
4910 least NMATCH elements, and we set them to the offsets of the
4911 corresponding matched substrings.
4912
4913 EFLAGS specifies `execution flags' which affect matching: if
4914 REG_NOTBOL is set, then ^ does not match at the beginning of the
4915 string; if REG_NOTEOL is set, then $ does not match at the end.
4916
4917 We return 0 if we find a match and REG_NOMATCH if not. */
4918
4919int
4920regexec (preg, string, nmatch, pmatch, eflags)
4921 const regex_t *preg;
4922 const char *string;
4923 size_t nmatch;
4924 regmatch_t pmatch[];
4925 int eflags;
4926{
4927 int ret;
4928 struct re_registers regs;
4929 regex_t private_preg;
4930 int len = strlen (string);
4931 boolean want_reg_info = !preg->no_sub && nmatch > 0;
4932
4933 private_preg = *preg;
4934
4935 private_preg.not_bol = !!(eflags & REG_NOTBOL);
4936 private_preg.not_eol = !!(eflags & REG_NOTEOL);
4937
4938 /* The user has told us exactly how many registers to return
4939 information about, via `nmatch'. We have to pass that on to the
4940 matching routines. */
4941 private_preg.regs_allocated = REGS_FIXED;
4942
4943 if (want_reg_info)
4944 {
4945 regs.num_regs = nmatch;
4946 regs.start = TALLOC (nmatch, regoff_t);
4947 regs.end = TALLOC (nmatch, regoff_t);
4948 if (regs.start == NULL || regs.end == NULL)
4949 return (int) REG_NOMATCH;
4950 }
4951
4952 /* Perform the searching operation. */
4953 ret = re_search (&private_preg, string, len,
4954 /* start: */ 0, /* range: */ len,
4955 want_reg_info ? &regs : (struct re_registers *) 0);
4956
4957 /* Copy the register information to the POSIX structure. */
4958 if (want_reg_info)
4959 {
4960 if (ret >= 0)
4961 {
4962 unsigned r;
4963
4964 for (r = 0; r < nmatch; r++)
4965 {
4966 pmatch[r].rm_so = regs.start[r];
4967 pmatch[r].rm_eo = regs.end[r];
4968 }
4969 }
4970
4971 /* If we needed the temporary register info, free the space now. */
4972 free (regs.start);
4973 free (regs.end);
4974 }
4975
4976 /* We want zero return to mean success, unlike `re_search'. */
4977 return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH;
4978}
4979
4980
4981/* Returns a message corresponding to an error code, ERRCODE, returned
4982 from either regcomp or regexec. We don't use PREG here. */
4983
4984size_t
4985regerror (errcode, preg, errbuf, errbuf_size)
4986 int errcode;
4987 const regex_t *preg;
4988 char *errbuf;
4989 size_t errbuf_size;
4990{
4991# define _RERR_(n,t) sizeof( t ),
4992 static const size_t aSizes[] = { REG_ERR_TABLE };
4993# undef _RERR_
4994 const char* msg;
4995 size_t msg_size;
4996
4997 if ((unsigned)errcode >= (int)REG_ERR_COUNT)
4998 /* Only error codes returned by the rest of the code should be passed
4999 to this routine. If we are given anything else, or if other regex
5000 code generates an invalid error code, then the program has a bug.
5001 Dump core so we can fix it. */
5002 abort ();
5003
5004 msg = re_error_msg[ errcode ];
5005 msg_size = aSizes[ errcode ];
5006
5007 if (msg_size >= errbuf_size)
5008 {
5009 strncpy (errbuf, msg, errbuf_size - 1);
5010 errbuf[errbuf_size - 1] = 0;
5011 }
5012 else
5013 strcpy (errbuf, msg);
5014
5015 return msg_size;
5016#ifdef __TANDEM
5017/*
5018 * WARN 93: no reference to identifier "preg"
5019 */
5020#pragma NOWARN( 93 )
5021#endif
5022}
5023#ifdef __TANDEM
5024#pragma WARN( 93 )
5025#endif
5026
5027
5028/* Free dynamically allocated space used by PREG. */
5029
5030void
5031regfree (preg)
5032 regex_t *preg;
5033{
5034 if (preg->buffer != NULL)
5035 free (preg->buffer);
5036 preg->buffer = NULL;
5037
5038 preg->allocated = 0;
5039 preg->used = 0;
5040
5041 if (preg->fastmap != NULL)
5042 free (preg->fastmap);
5043 preg->fastmap = NULL;
5044 preg->fastmap_accurate = 0;
5045
5046 if (preg->translate != NULL)
5047 free (preg->translate);
5048 preg->translate = NULL;
5049}
5050
5051#endif /* not emacs */
5052\f
5053/*
5054Local variables:
5055make-backup-files: t
5056version-control: t
5057trim-versions-without-asking: nil
5058End:
5059*/
This page took 0.528221 seconds and 5 git commands to generate.