# This is patch 4 to gawk 2.15. # Change to the gawk 2.15.3 distribution directory, and sh this file. # Running it through patch is not enough; several files and directories # must be removed first. # # Make sure that the patch program is in your search path. rm -fr awktab.c regex.h regex.c dfa.h dfa.c patch -p1 << \EOF diff -crN gawk-2.15.3/INSTALL gawk-2.15.4/INSTALL *** gawk-2.15.3/INSTALL Wed Dec 31 19:00:00 1969 --- gawk-2.15.4/INSTALL Wed Dec 29 10:47:19 1993 *************** *** 0 **** --- 1,32 ---- + December, 1993 - arnold@skeeve.atl.ga.us + + These are minimal instructions for installing gawk on a Unix system. + Full instructions for installing and porting gawk are given in Chapter 16 + of the manual. + + 1. Look in the `config' directory for a file that is appropriate to your + Unix system. For example, use `ultrix41' for Ultrix 4.1 or later, and + `sunos41' for SunOS 4.1.x. + + 2. Type + ./configure FILE + where FILE is the name you chose earlier. + + 3. Type + make + This should build gawk. + + 4. Type + make -n install + to see where things will be installed by default. Edit the Makefile to + change these defaults if they are not appropriate to your system. Then + type + make install + to install gawk. + + If you don't have GCC, or if your Unix version is not close enough to one + of the ones in the `config' directory, then you will need to do more work; + see the manual. + + The next major release of gawk will use GNU Autoconf; the installation + instructions will change then. diff -crN gawk-2.15.3/Makefile.dec gawk-2.15.4/Makefile.dec *** gawk-2.15.3/Makefile.dec Wed Dec 31 19:00:00 1969 --- gawk-2.15.4/Makefile.dec Wed Dec 29 10:47:59 1993 *************** *** 0 **** --- 1,269 ---- + # Makefile for GNU Awk. + # + # Copyright (C) 1986, 1988-1993 the Free Software Foundation, Inc. + # + # This file is part of GAWK, the GNU implementation of the + # AWK Progamming Language. 
+ # + # GAWK is free software; you can redistribute it and/or modify + # it under the terms of the GNU General Public License as published by + # the Free Software Foundation; either version 2 of the License, or + # (at your option) any later version. + # + # GAWK is distributed in the hope that it will be useful, + # but WITHOUT ANY WARRANTY; without even the implied warranty of + # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + # GNU General Public License for more details. + # + # You should have received a copy of the GNU General Public License + # along with GAWK; see the file COPYING. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + # User tunable macros -- CHANGE THESE IN Makefile.in RATHER THAN IN + # Makefile, OR configure WILL OVERWRITE YOUR CHANGES + + prefix = /usr/local + exec_prefix = $(prefix) + binprefix = + manprefix = + + bindir = $(exec_prefix)/bin + libdir = $(exec_prefix)/lib + mandir = $(prefix)/man/man1 + manext = .1 + infodir = $(prefix)/info + + # The provided "configure" is used to turn a config file (samples in + # the "config" directory into commands to edit config.in into + # a suitable config.h and to edit Makefile.in into Makefile. + # To port GAWK, create an appropriate config file using the ones in + # the config directory as examples and using the comments in config.in + # as a guide. 
+ + CC= cc + ##MAKE_CC## CC = cc + + PROFILE= #-pg + DEBUG= #-DMALLOCDEBUG #-DDEBUG #-DFUNC_TRACE #-DMPROF + LINKSTATIC= #-Bstatic + WARN= #-W -Wunused -Wimplicit -Wreturn-type -Wcomment # for gcc only + + # Parser to use on grammar - any one of the following will work + PARSER = bison -y + #PARSER = yacc + #PARSER = byacc + + # Set LIBS to any libraries that are machine specific + LIBS = + + # Cray 2 running Unicos 5.0.7 + ##MAKE_LIBNET## LIBS = -lnet + + + # Systems with alloca in /lib/libPW.a + ##MAKE_ALLOCA_PW## LIBS = -lPW + + # ALLOCA - only needed if you use bison + # Set equal to alloca.o if your system is S5 and you don't have + # alloca. Uncomment one of the rules below to make alloca.o from + # either alloca.s or alloca.c. + # This should have already been done automatically by configure. + # + # Some systems have alloca in libPW.a, so LIBS=-lPW may work, too. + ##MAKE_ALLOCA_C## ALLOCA= alloca.o + ##MAKE_ALLOCA_S## ALLOCA= alloca.o + + VFLAGS= + + # VMS POSIX, VAXC V3.2 + ##MAKE_VMS-Posix## VFLAGS = -UVMS -D__STDC__=0 + + # HP/Apollo running cc version 6.7 or earlier + ##MAKE_Apollo## VFLAGS = -U__STDC__ -A run,sys5.3 + ##MAKE_Apollo## LIBS = -A sys,any + + # SGI IRIX 4.0.5 cc flags + ##MAKE_SGI## VFLAGS = -cckr + + ##MAKE_NeXT## VFLAGS = -DGFMT_WORKAROUND + + CFLAGS = -Olimit 1500 + FLAGS = -DGAWK -DHAVE_CONFIG_H $(VFLAGS) $(DEBUG) $(PROFILE) $(WARN) -Dconst="" + LDFLAGS = $(LINKSTATIC) $(PROFILE) + + .c.o: + $(CC) $(CFLAGS) $(FLAGS) -c $< + + # object files + AWKOBJS = main.o eval.o builtin.o msg.o iop.o io.o field.o array.o \ + node.o version.o missing.o re.o getopt.o getopt1.o + + ALLOBJS = $(AWKOBJS) awktab.o + + # GNUOBJS + # GNU stuff that gawk uses as library routines. 
+ GNUOBJS= regex.o dfa.o $(ALLOCA) + + # source and documentation files + SRC = main.c eval.c builtin.c msg.c version.c \ + iop.c io.c field.c array.c node.c missing.c re.c getopt.c getopt1.c + + ALLSRC= $(SRC) awktab.c + + AWKSRC= awk.h awk.y $(ALLSRC) patchlevel.h protos.h config.in getopt.h + + GNUSRC = alloca.c alloca.s dfa.c dfa.h regex.c regex.h + + COPIES = missing/system.c missing/tzset.c \ + missing/memcmp.c missing/memcpy.c missing/memset.c \ + missing/random.c missing/strncasecmp.c missing/strchr.c \ + missing/strerror.c missing/strtod.c \ + missing/strftime.c missing/strftime.3 + + SUPPORT = support/texindex.c support/texinfo.tex + + DOCS= gawk.1 gawk.texi + + TEXFILES= gawk.aux gawk.cp gawk.cps gawk.fn gawk.fns gawk.ky gawk.kys \ + gawk.pg gawk.pgs gawk.toc gawk.tp gawk.tps gawk.vr gawk.vrs + + MISC = NEWS COPYING FUTURES Makefile.* PROBLEMS README* PORTS POSIX \ + mungeconf configure ACKNOWLEDGMENT LIMITATIONS + + OTHERS= pc/* atari/* vms/* + + ALLDOC= gawk.dvi $(TEXFILES) gawk.info* + + # Release of gawk. There can be no leading or trailing white space here! + REL=2.15 + + # rules to build gawk + gawk: $(ALLOBJS) $(GNUOBJS) $(REOBJS) + $(CC) -o gawk $(LDFLAGS) $(ALLOBJS) $(GNUOBJS) $(REOBJS) -lm $(LIBS) + + regex.o: regex.h awk.h + $(CC) $(FLAGS) -c regex.c + + $(AWKOBJS) regex.o dfa.o: awk.h dfa.h regex.h + + getopt.o: getopt.h + + getopt1.o: getopt.h + + main.o: patchlevel.h + + awktab.c: awk.y + $(PARSER) -v awk.y + ##MAKE_VMS-Posix## mv ytab.c awktab.c + ##MAKE_VMS-Posix## dummy.awk_tab.target: + sed '/^extern char .malloc(), .realloc();$$/d' y.tab.c >awktab.c + rm y.tab.c + + awktab.o: awk.h + + config.h: config.in + @echo You must provide a config.h! 
+ @echo Run \"./configure\" to build it for known systems + @echo or copy config.in to config.h and edit it.; exit 1 + + install: gawk gawk.info + cp gawk $(bindir) && chmod 755 $(bindir)/gawk + cp gawk.1 $(mandir)/gawk$(manext) && chmod 644 $(mandir)/gawk$(manext) + cp gawk.info* $(infodir) && chmod 644 $(infodir)/gawk.info* + + uninstall: + rm -f $(bindir)/gawk $(mandir)/gawk$(manext) $(infodir)/gawk.info* + + # ALLOCA: uncomment this if your system (notably System V boxen) + # does not have alloca in /lib/libc.a or /lib/libPW.a + # + # If your machine is not supported by the assembly version of alloca.s, + # use the C version which follows instead. It uses the default rules to + # make alloca.o. + # + # One of these rules should have already been selected by running configure. + + + ##MAKE_ALLOCA_S## alloca.o: alloca.s + ##MAKE_ALLOCA_S## /lib/cpp < alloca.s | sed '/^#/d' > t.s + ##MAKE_ALLOCA_S## as t.s -o alloca.o + ##MAKE_ALLOCA_S## rm t.s + + ##MAKE_ALLOCA_C## alloca.o: alloca.c + + # auxiliary rules for release maintenance + lint: $(ALLSRC) + lint -hcbax $(FLAGS) $(ALLSRC) + + xref: + cxref -c $(FLAGS) $(ALLSRC) | grep -v ' /' >xref + + clean: + rm -rf gawk *.o core + cd test && make clean + + distclean: clean + rm -f Makefile *.orig *.rej */*.orig */*.rej awk.output gmon.out \ + make.out y.output config.h + + mostlyclean: clean + + realclean: distclean + rm -f awktab.c $(ALLDOC) + + cleaner: clean + rm -f gawk awktab.c Makefile config.h + + clobber: clean + rm -f $(ALLDOC) gawk.log config.h + + gawk.dvi: gawk.texi + cp support/texinfo.tex . + tex gawk.texi; texindex gawk.?? + tex gawk.texi; texindex gawk.?? 
+ tex gawk.texi + rm -f texinfo.tex + + gawk.info: gawk.texi + makeinfo gawk.texi + + dist: $(AWKSRC) $(GNUSRC) $(DOCS) $(MISC) $(COPIES) $(SUPPORT) distclean + -rm -rf gawk-$(REL)* + dir=gawk-$(REL).`gawk '{print $$3}' patchlevel.h` && \ + mkdir $$dir && \ + cp -p $(AWKSRC) $(GNUSRC) $(DOCS) $(MISC) $$dir && \ + mkdir $$dir/missing && cp -p $(COPIES) $$dir/missing && \ + mkdir $$dir/atari && cp -p atari/* $$dir/atari && \ + mkdir $$dir/pc && cp -p pc/* $$dir/pc && \ + mkdir $$dir/vms && cp -p vms/* $$dir/vms && \ + mkdir $$dir/config && cp -p config/* $$dir/config && \ + mkdir $$dir/support && cp -p support/* $$dir/support && \ + cp -pr test $$dir && \ + chmod -R a+r $$dir && \ + chmod -R a-w $$dir && \ + find $$dir -type d -exec chmod 755 {} ';' && \ + find $$dir -print | doschk && \ + tar -cf - $$dir | gzip > $$dir.tar.gz && \ + rm -fr $$dir + + gawk-doc-$(REL).tar.gz: gawk.info gawk.dvi gawk.1 + -rm -rf gawk-doc-$(REL) gawk-doc-$(REL).tar.gz + -mkdir gawk-doc-$(REL) + cp -p $(ALLDOC) gawk-doc-$(REL) + groff -Tascii -man gawk.1 > gawk-doc-$(REL)/gawk.1.pr + tar -cf - gawk-doc-$(REL) | gzip > gawk-doc-$(REL).tar.gz + + gawk-ps-$(REL).tar.gz: gawk.dvi gawk.1 + -rm -rf gawk-ps-$(REL) gawk-ps-$(REL).tar.gz + -mkdir gawk-ps-$(REL) + dvips -o gawk-ps-$(REL)/gawk.postscript gawk.dvi + groff -man gawk.1 > gawk-ps-$(REL)/gawk.1.ps + tar -cf - gawk-ps-$(REL) | gzip > gawk-ps-$(REL).tar.gz + + release: dist gawk-doc-$(REL).tar.gz gawk-ps-$(REL).tar.gz + + test: gawk + cd test; make -k + + check: test + diff -crN gawk-2.15.3/Makefile.in gawk-2.15.4/Makefile.in *** gawk-2.15.3/Makefile.in Sun Nov 7 11:51:38 1993 --- gawk-2.15.4/Makefile.in Wed Dec 29 10:25:34 1993 *************** *** 1,6 **** # Makefile for GNU Awk. # ! # Copyright (C) 1986, 1988-1992 the Free Software Foundation, Inc. # # This file is part of GAWK, the GNU implementation of the # AWK Progamming Language. --- 1,6 ---- # Makefile for GNU Awk. # ! 
# Copyright (C) 1986, 1988-1993 the Free Software Foundation, Inc. # # This file is part of GAWK, the GNU implementation of the # AWK Progamming Language. *************** *** 40,46 **** # the config directory as examples and using the comments in config.in # as a guide. ! CC= gcc ##MAKE_CC## CC = cc PROFILE= #-pg --- 40,46 ---- # the config directory as examples and using the comments in config.in # as a guide. ! CC= gcc -g ##MAKE_CC## CC = cc PROFILE= #-pg *************** *** 87,93 **** ##MAKE_NeXT## VFLAGS = -DGFMT_WORKAROUND ! CFLAGS = -g -O FLAGS = -DGAWK -DHAVE_CONFIG_H $(VFLAGS) $(DEBUG) $(PROFILE) $(WARN) LDFLAGS = $(LINKSTATIC) $(PROFILE) --- 87,93 ---- ##MAKE_NeXT## VFLAGS = -DGFMT_WORKAROUND ! CFLAGS = -O FLAGS = -DGAWK -DHAVE_CONFIG_H $(VFLAGS) $(DEBUG) $(PROFILE) $(WARN) LDFLAGS = $(LINKSTATIC) $(PROFILE) *************** *** 128,134 **** gawk.pg gawk.pgs gawk.toc gawk.tp gawk.tps gawk.vr gawk.vrs MISC = NEWS COPYING FUTURES Makefile.* PROBLEMS README* PORTS POSIX \ ! mungeconf configure ACKNOWLEDGMENT LIMITATIONS OTHERS= pc/* atari/* vms/* --- 128,134 ---- gawk.pg gawk.pgs gawk.toc gawk.tp gawk.tps gawk.vr gawk.vrs MISC = NEWS COPYING FUTURES Makefile.* PROBLEMS README* PORTS POSIX \ ! mungeconf configure ACKNOWLEDGMENT LIMITATIONS INSTALL OTHERS= pc/* atari/* vms/* diff -crN gawk-2.15.3/NEWS gawk-2.15.4/NEWS *** gawk-2.15.3/NEWS Sun Nov 7 11:44:14 1993 --- gawk-2.15.4/NEWS Sat Jan 15 22:37:42 1994 *************** *** 1,3 **** --- 1,53 ---- + Changes from 2.15.3 to 2.15.4 + ----------------------------- + + Lots of lint fixes, and do_sprintf made mostly ANSI C compatible. + + Man page updated and edited. + + Copyrights updated. + + Arrays now grow dynamically, initially scaling up by an order of magnitude + and then doubling, up to ~ 64K. This should keep gawk's performance + graceful under heavy load. + + New `delete array' feature added. Only documented in the man page. + + Switched to dfa and regex suites from grep-2.0. 
These offer the ability to + move to POSIX regexps in the next release. + + Disabled GNU regex ops. + + Research awk -m option now recognized. It does nothing in gawk, since gawk + has no static limits. Only documented in the man page. + + New bionic (faster, better, stronger than before) hashing function. + + Bug fix in argument handling. `gawk -X' now notices there was no program. + Additional bug fixes to make --compat and --lint work again. + + Many changes for 16-bit cleanliness. + + Add explicit alloca(0) in io.c to recover space from C alloca. + + Fixed file descriptor leak in io.c. + + The --version option now follows the GNU coding standards and exits. + + Fixed several prototypes in protos.h. + + Several tests updated. On Solaris, warn that the out? tests will fail. + + Configuration files for SunOS with cc and Solaris 2.x added. + + Improved error messages in awk.y on gawk extensions if do_unix or do_compat. + + INSTALL file added. + + Fixed Atari Makefile and several VMS specific changes. + + Better conversion of numbers to strings on systems with broken sprintfs. + Changes from 2.15.2 to 2.15.3 ----------------------------- diff -crN gawk-2.15.3/PROBLEMS gawk-2.15.4/PROBLEMS *** gawk-2.15.3/PROBLEMS Wed May 5 19:50:35 1993 --- gawk-2.15.4/PROBLEMS Tue Jan 4 16:58:10 1994 *************** *** 3,6 **** Please keep in mind that the code is still undergoing significant evolution. ! 1. Gawk's printf is probably still not POSIX compliant. --- 3,10 ---- Please keep in mind that the code is still undergoing significant evolution. ! 1. The interactions with the lexer and yyerror need reworking. It is possible ! to get line numbers that are one line off if --compat or --posix is ! true and either `next file' or `delete array' are used. ! ! Really the whole lexical analysis stuff needs reworking. 
diff -crN gawk-2.15.3/README gawk-2.15.4/README *** gawk-2.15.3/README Fri Oct 22 06:04:15 1993 --- gawk-2.15.4/README Wed Dec 29 10:54:41 1993 *************** *** 10,16 **** Known problems are given in the PROBLEMS file. Work to be done is described briefly in the FUTURES file. Verified ports are listed in ! the PORTS file. Changes in this version are summarized in the CHANGES file. Please read the LIMITATIONS and ACKNOWLEDGMENT files. Read the file POSIX for a discussion of how the standard says comparisons --- 10,16 ---- Known problems are given in the PROBLEMS file. Work to be done is described briefly in the FUTURES file. Verified ports are listed in ! the PORTS file. Changes in this version are summarized in the NEWS file. Please read the LIMITATIONS and ACKNOWLEDGMENT files. Read the file POSIX for a discussion of how the standard says comparisons *************** *** 28,33 **** --- 28,35 ---- Check whether there is a system-specific README file for your system. + A quick overview of the installation process is in the file INSTALL. + Makefile.in may need some tailoring. The only changes necessary should be to change installation targets or to change compiler flags. The changes to make in Makefile.in are commented and should be obvious. *************** *** 69,75 **** PRINTING THE MANUAL ! The 'support' directory contains texinfo.tex 2.65, which will be necessary for printing the manual, and the texindex.c program from the texinfo distribution which is also necessary. See the makefile for the steps needed to get a DVI file from the manual. --- 71,77 ---- PRINTING THE MANUAL ! The 'support' directory contains texinfo.tex 2.115, which will be necessary for printing the manual, and the texindex.c program from the texinfo distribution which is also necessary. See the makefile for the steps needed to get a DVI file from the manual. *************** *** 93,99 **** Arnold Robbins 1736 Reindeer Drive ! 
Atlanta, GA, 30329, USA INTERNET: arnold@skeeve.atl.ga.us UUCP: { gatech, emory, emoryu1 }!skeeve!arnold --- 95,101 ---- Arnold Robbins 1736 Reindeer Drive ! Atlanta, GA, 30329-3528, USA INTERNET: arnold@skeeve.atl.ga.us UUCP: { gatech, emory, emoryu1 }!skeeve!arnold *************** *** 115,122 **** Atari ST: Michal Jaegermann ! NTOMCZAK@vm.ucs.UAlberta.CA (e-mail only) OS/2: Kai Uwe Rommel rommel@ars.muc.de (e-mail only) --- 117,126 ---- Atari ST: Michal Jaegermann ! michal@gortel.phys.ualberta.ca (e-mail only) OS/2: Kai Uwe Rommel rommel@ars.muc.de (e-mail only) + Darrel Hankerson + hankedr@mail.auburn.edu (e-mail only) diff -crN gawk-2.15.3/README.hpux8x gawk-2.15.4/README.hpux8x *** gawk-2.15.3/README.hpux8x Sat May 1 23:29:48 1993 --- gawk-2.15.4/README.hpux8x Wed Dec 29 10:58:33 1993 *************** *** 2,4 **** --- 2,13 ---- option: "+Obb1000". The file awktab.c fails to compile without this option. The option sets the maximum number of basic blocks allowed in a function to 1000. + ------------------------------------ + + December 1993 - arnold@skeeve.atl.ga.us + + I continue to get reports that gawk compiled with gcc (through 2.5.x) on the + hp9000 series 700 systems does not pass its test suite. When compiled with + hp's C compiler, it does fine. If you compile with cc and it fails, then + let me hear about it please. If you compile with gcc and then complain to + me, I'm going to fuss at you for not reading the documentation. (:-) diff -crN gawk-2.15.3/README.pc gawk-2.15.4/README.pc *** gawk-2.15.3/README.pc Fri Oct 15 15:49:10 1993 --- gawk-2.15.4/README.pc Thu Nov 25 10:35:15 1993 *************** *** 49,64 **** and DOS) and GNU bash (OS/2 2.x) are good choices. Stewartson's shell 2.2 is in beta (as of 17-Sep-93). Earlier versions can be found at ! ftp.informatik.tu-muenchen.de:pub/comp.os/os2/shells/sh164-2.zip oak.oakland.edu:pub/msdos/sysutl/ms_sh21b.zip ftp-os2.cdrom.com:pub/os2/all/unix/shells/ms_sh21c.zip ! 
OS/2 HPFS users should obtain ms_sh21c.zip over ms_sh21b.zip. ksh ! may be another possibility, available from ftp.informatik.tu-muenchen.de. Bash for OS/2 2.x can be found at ! ftp.informatik.tu-muenchen.de:pub/comp.os/os2/gnu/gnubash.zip ftp-os2.cdrom.com:pub/os2/2_x/unix/shells/bash.zip --- 49,66 ---- and DOS) and GNU bash (OS/2 2.x) are good choices. Stewartson's shell 2.2 is in beta (as of 17-Sep-93). Earlier versions can be found at ! ftp.informatik.tu-muenchen.de:pub/comp/os/os2/shells/sh164-2.zip oak.oakland.edu:pub/msdos/sysutl/ms_sh21b.zip ftp-os2.cdrom.com:pub/os2/all/unix/shells/ms_sh21c.zip ! OS/2 HPFS users should obtain ms_sh21c.zip over ms_sh21b.zip. ! The Korn shell (ksh) may be another possibility: ! ! ftp.informatik.tu-muenchen.de:pub/comp/os/os2/shells/ksh48.zip Bash for OS/2 2.x can be found at ! ftp.informatik.tu-muenchen.de:pub/comp/os/os2/shells/gnu/gnubash.zip ftp-os2.cdrom.com:pub/os2/2_x/unix/shells/bash.zip *************** *** 70,76 **** 3. dmake is by Dennis Vadura (dvadura@watdragon.uwaterloo.ca), CS Dept., University of Waterloo. An OS/2 and DOS version can be found at ! ftp.informatik.tu-muenchen.de:pub/comp.os/os2/devtools/dmake38.zip ftp-os2.cdrom.com:pub/os2/all/program/dmake38x.zip Ndmake is by D.G. Kneller. This ShareWare program was later released --- 72,78 ---- 3. dmake is by Dennis Vadura (dvadura@watdragon.uwaterloo.ca), CS Dept., University of Waterloo. An OS/2 and DOS version can be found at ! ftp.informatik.tu-muenchen.de:pub/comp/os/os2/devtools/dmake38.zip ftp-os2.cdrom.com:pub/os2/all/program/dmake38x.zip Ndmake is by D.G. Kneller. This ShareWare program was later released *************** *** 81,87 **** GNU make is from the FSF. An OS/2 2.x version can be found at ! ftp.informatik.tu-muenchen.de:pub/comp.os/os2/devtools/gnumake.zip The "lookup" feature of ncftp reports: --- 83,89 ---- GNU make is from the FSF. An OS/2 2.x version can be found at ! 
ftp.informatik.tu-muenchen.de:pub/comp/os/os2/devtools/gnumake.zip The "lookup" feature of ncftp reports: diff -crN gawk-2.15.3/README.sgi gawk-2.15.4/README.sgi *** gawk-2.15.3/README.sgi Wed Dec 31 19:00:00 1969 --- gawk-2.15.4/README.sgi Thu Jan 20 22:19:21 1994 *************** *** 0 **** --- 1,7 ---- + January 1994 + + I have reports that gawk fails to pass its tests when compiled with gcc on + the IRIX, through gcc 2.5.7. Therefore you should probably try compiling + with cc. If gawk compiled with cc fails to pass its `make test', then I want + to hear about it. Otherwise I will fuss at you for not reading the + documentation. diff -crN gawk-2.15.3/array.c gawk-2.15.4/array.c *** gawk-2.15.3/array.c Tue Nov 2 06:33:23 1993 --- gawk-2.15.4/array.c Sat Jan 15 22:41:42 1994 *************** *** 3,9 **** */ /* ! * Copyright (C) 1986, 1988, 1989, 1991, 1992 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. --- 3,9 ---- */ /* ! * Copyright (C) 1986, 1988, 1989, 1991, 1992, 1993 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. *************** *** 23,31 **** --- 23,46 ---- * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + /* + * Tree walks (``for (iggy in foo)'') and array deletions use expensive + * linear searching. So what we do is start out with small arrays and + * grow them as needed, so that our arrays are hopefully small enough, + * most of the time, that they're pretty full and we're not looking at + * wasted space. + * + * The decision is made to grow the array if the average chain length is + * ``too big''. This is defined as the total number of entries in the table + * divided by the size of the array being greater than some constant. 
+ */ + + #define AVG_CHAIN_MAX 10 /* don't want to linear search more than this */ + #include "awk.h" static NODE *assoc_find P((NODE *symbol, NODE *subs, int hash1)); + static void grow_table P((NODE *symbol)); NODE * concat_exp(tree) *************** *** 84,90 **** if (symbol->var_array == 0) return; ! for (i = 0; i < HASHSIZE; i++) { for (bucket = symbol->var_array[i]; bucket; bucket = next) { next = bucket->ahnext; unref(bucket->ahname); --- 99,105 ---- if (symbol->var_array == 0) return; ! for (i = 0; i < symbol->array_size; i++) { for (bucket = symbol->var_array[i]; bucket; bucket = next) { next = bucket->ahnext; unref(bucket->ahname); *************** *** 93,109 **** } symbol->var_array[i] = 0; } } /* * calculate the hash function of the string in subs */ unsigned int ! hash(s, len) ! register char *s; register size_t len; { ! register unsigned long h = 0, g; while (len--) { h = (h << 4) + *s++; --- 108,132 ---- } symbol->var_array[i] = 0; } + free(symbol->var_array); + symbol->var_array = NULL; + symbol->array_size = symbol->table_size = 0; } /* * calculate the hash function of the string in subs */ unsigned int ! hash(s, len, hsize) ! register const char *s; register size_t len; + unsigned long hsize; { ! register unsigned long h = 0; ! ! #ifdef this_is_really_slow ! ! register unsigned long g; while (len--) { h = (h << 4) + *s++; *************** *** 113,122 **** h = h ^ g; } } ! if (h < HASHSIZE) ! return h; ! else ! return h%HASHSIZE; } /* --- 136,219 ---- h = h ^ g; } } ! ! #else /* this_is_really_slow */ ! /* ! * This is INCREDIBLY ugly, but fast. We break the string up into 8 byte ! * units. On the first time through the loop we get the "leftover bytes" ! * (strlen % 8). On every other iteration, we perform 8 HASHC's so we handle ! * all 8 bytes. Essentially, this saves us 7 cmp & branch instructions. If ! * this routine is heavily used enough, it's worth the ugly coding. ! * ! * OZ's original sdbm hash, copied from Margo Seltzers db package. ! * ! */ ! 
! /* Even more speed: */ ! /* #define HASHC h = *s++ + 65599 * h */ ! /* Because 65599 = pow(2,6) + pow(2,16) - 1 we multiply by shifts */ ! #define HASHC htmp = (h << 6); \ ! h = *s++ + htmp + (htmp << 10) - h ! ! unsigned long htmp; ! ! h = 0; ! ! #if defined(VAXC) ! /* ! * [This was an implementation of "Duff's Device", but it has been ! * redone, separating the switch for extra iterations from the loop. ! * This is necessary because the DEC VAX-C compiler is STOOPID.] ! */ ! switch (len & (8 - 1)) { ! case 7: HASHC; ! case 6: HASHC; ! case 5: HASHC; ! case 4: HASHC; ! case 3: HASHC; ! case 2: HASHC; ! case 1: HASHC; ! default: break; ! } ! ! if (len > (8 - 1)) { ! register size_t loop = len >> 3; ! do { ! HASHC; ! HASHC; ! HASHC; ! HASHC; ! HASHC; ! HASHC; ! HASHC; ! HASHC; ! } while (--loop); ! } ! #else /* !VAXC */ ! /* "Duff's Device" for those who can handle it */ ! if (len > 0) { ! register size_t loop = (len + 8 - 1) >> 3; ! ! switch (len & (8 - 1)) { ! case 0: ! do { /* All fall throughs */ ! HASHC; ! case 7: HASHC; ! case 6: HASHC; ! case 5: HASHC; ! case 4: HASHC; ! case 3: HASHC; ! case 2: HASHC; ! case 1: HASHC; ! } while (--loop); ! } ! } ! #endif /* !VAXC */ ! #endif /* this_is_really_slow - not */ ! ! if (h >= hsize) ! h %= hsize; ! return h; } /* *************** *** 158,164 **** if (symbol->var_array == 0) return 0; subs = concat_exp(subs); /* concat_exp returns a string node */ ! hash1 = hash(subs->stptr, subs->stlen); if (assoc_find(symbol, subs, hash1) == NULL) { free_temp(subs); return 0; --- 255,261 ---- if (symbol->var_array == 0) return 0; subs = concat_exp(subs); /* concat_exp returns a string node */ ! hash1 = hash(subs->stptr, subs->stlen, (unsigned long) symbol->array_size); if (assoc_find(symbol, subs, hash1) == NULL) { free_temp(subs); return 0; *************** *** 183,199 **** register NODE *bucket; (void) force_string(subs); - hash1 = hash(subs->stptr, subs->stlen); ! if (symbol->var_array == 0) { /* this table really should grow ! 
* dynamically */ ! size_t size; ! ! size = sizeof(NODE *) * HASHSIZE; ! emalloc(symbol->var_array, NODE **, size, "assoc_lookup"); ! memset((char *)symbol->var_array, 0, size); symbol->type = Node_var_array; } else { bucket = assoc_find(symbol, subs, hash1); if (bucket != NULL) { free_temp(subs); --- 280,295 ---- register NODE *bucket; (void) force_string(subs); ! if (symbol->var_array == 0) { symbol->type = Node_var_array; + symbol->array_size = symbol->table_size = 0; /* sanity */ + grow_table(symbol); + hash1 = hash(subs->stptr, subs->stlen, + (unsigned long) symbol->array_size); } else { + hash1 = hash(subs->stptr, subs->stlen, + (unsigned long) symbol->array_size); bucket = assoc_find(symbol, subs, hash1); if (bucket != NULL) { free_temp(subs); *************** *** 205,210 **** --- 301,317 ---- if (do_lint && subs->stlen == 0) warning("subscript of array `%s' is null string", symbol->vname); + + /* first see if we would need to grow the array, before installing */ + symbol->table_size++; + if ((symbol->flags & ARRAYMAXED) == 0 + && symbol->table_size/symbol->array_size > AVG_CHAIN_MAX) { + grow_table(symbol); + /* have to recompute hash value for new size */ + hash1 = hash(subs->stptr, subs->stlen, + (unsigned long) symbol->array_size); + } + getnode(bucket); bucket->type = Node_ahash; if (subs->flags & TEMP) *************** *** 240,246 **** if (symbol->var_array == 0) return; subs = concat_exp(tree); /* concat_exp returns string node */ ! hash1 = hash(subs->stptr, subs->stlen); last = NULL; for (bucket = symbol->var_array[hash1]; bucket; last = bucket, bucket = bucket->ahnext) --- 347,353 ---- if (symbol->var_array == 0) return; subs = concat_exp(tree); /* concat_exp returns string node */ ! 
hash1 = hash(subs->stptr, subs->stlen, (unsigned long) symbol->array_size); last = NULL; for (bucket = symbol->var_array[hash1]; bucket; last = bucket, bucket = bucket->ahnext) *************** *** 256,261 **** --- 363,376 ---- unref(bucket->ahname); unref(bucket->ahvalue); freenode(bucket); + symbol->table_size--; + if (symbol->table_size <= 0) { + memset(symbol->var_array, '\0', + sizeof(NODE *) * symbol->array_size); + symbol->table_size = symbol->array_size = 0; + free(symbol->var_array); + symbol->var_array = NULL; + } } void *************** *** 263,274 **** NODE *symbol; struct search *lookat; { ! if (!symbol->var_array) { lookat->retval = NULL; return; } lookat->arr_ptr = symbol->var_array; ! lookat->arr_end = lookat->arr_ptr + HASHSIZE; /* added */ lookat->bucket = symbol->var_array[0]; assoc_next(lookat); } --- 378,389 ---- NODE *symbol; struct search *lookat; { ! if (symbol->var_array == NULL) { lookat->retval = NULL; return; } lookat->arr_ptr = symbol->var_array; ! lookat->arr_end = lookat->arr_ptr + symbol->array_size; lookat->bucket = symbol->var_array[0]; assoc_next(lookat); } *************** *** 290,293 **** --- 405,482 ---- lookat->retval = NULL; } return; + } + + /* grow_table --- grow a hash table */ + + static void + grow_table(symbol) + NODE *symbol; + { + NODE **old, **new, *chain, *next; + int i, j; + unsigned long hash1; + unsigned long oldsize, newsize; + /* + * This is an array of primes. We grow the table by an order of + * magnitude each time (not just doubling) so that growing is a + * rare operation. We expect, on average, that it won't happen + * more than twice. The final size is also chosen to be small + * enough so that MS-DOG mallocs can handle it. When things are + * very large (> 8K), we just double more or less, instead of + * just jumping from 8K to 64K. 
+ */ + static long sizes[] = { 13, 127, 1021, 8191, 16381, 32749, 65497 }; + + /* find next biggest hash size */ + oldsize = symbol->array_size; + newsize = 0; + for (i = 0, j = sizeof(sizes)/sizeof(sizes[0]); i < j; i++) { + if (oldsize < sizes[i]) { + newsize = sizes[i]; + break; + } + } + + if (newsize == oldsize) { /* table already at max (!) */ + symbol->flags |= ARRAYMAXED; + return; + } + + /* allocate new table */ + emalloc(new, NODE **, newsize * sizeof(NODE *), "grow_table"); + memset(new, '\0', newsize * sizeof(NODE *)); + + /* brand new hash table, set things up and return */ + if (symbol->var_array == NULL) { + symbol->table_size = 0; + goto done; + } + + /* old hash table there, move stuff to new, free old */ + old = symbol->var_array; + for (i = 0; i < oldsize; i++) { + if (old[i] == NULL) + continue; + + for (chain = old[i]; chain != NULL; chain = next) { + next = chain->ahnext; + hash1 = hash(chain->ahname->stptr, + chain->ahname->stlen, newsize); + + /* remove from old list, add to new */ + chain->ahnext = new[hash1]; + new[hash1] = chain; + + } + } + free(old); + + done: + /* + * note that symbol->table_size does not change if an old array, + * and is explicitly set to 0 if a new one. + */ + symbol->var_array = new; + symbol->array_size = newsize; } diff -crN gawk-2.15.3/atari/Makefile.st gawk-2.15.4/atari/Makefile.st *** gawk-2.15.3/atari/Makefile.st Tue Nov 2 06:30:46 1993 --- gawk-2.15.4/atari/Makefile.st Wed Dec 29 10:43:42 1993 *************** *** 5,11 **** # and modified system(). # Check comments in this Makefile and adjust to your needs!! # ! # Copyright (C) 1986, 1988-1992 the Free Software Foundation, Inc. # # This file is part of GAWK, the GNU implementation of the # AWK Progamming Language. --- 5,11 ---- # and modified system(). # Check comments in this Makefile and adjust to your needs!! # ! # Copyright (C) 1986, 1988-1993 the Free Software Foundation, Inc. 
# # This file is part of GAWK, the GNU implementation of the # AWK Progamming Language. *************** *** 58,64 **** # GNUOBJS # GNU stuff that gawk uses as library routines. ! GNUOBJS= rx.o dfa.o all: gawk.ttp --- 58,65 ---- # GNUOBJS # GNU stuff that gawk uses as library routines. ! REGEX = regex ! GNUOBJS= $(REGEX).o dfa.o all: gawk.ttp *************** *** 68,74 **** toglclr -fload $@ # xstrip -k $@ ! $(AWKOBJS) $(GNUOBJS): awk.h dfa.h regex.h config.h # cheat with defines to force an inclusion of a proper code getopt.o: getopt.h --- 69,75 ---- toglclr -fload $@ # xstrip -k $@ ! $(AWKOBJS) $(GNUOBJS): awk.h dfa.h $(REGEX).h config.h # cheat with defines to force an inclusion of a proper code getopt.o: getopt.h diff -crN gawk-2.15.3/atari/config.h gawk-2.15.4/atari/config.h *** gawk-2.15.3/atari/config.h Tue Nov 2 06:33:31 1993 --- gawk-2.15.4/atari/config.h Wed Dec 8 23:06:29 1993 *************** *** 59,63 **** #undef _MINIX /* on Minix, used to get Posix functions */ #undef _POSIX_1_SOURCE /* on Minix, define to 2 */ ! ! --- 59,67 ---- #undef _MINIX /* on Minix, used to get Posix functions */ #undef _POSIX_1_SOURCE /* on Minix, define to 2 */ ! /* ! * define const to nothing if not __STDC__ ! */ ! #ifndef __STDC__ ! #define const ! #endif diff -crN gawk-2.15.3/awk.h gawk-2.15.4/awk.h *** gawk-2.15.3/awk.h Sun Nov 7 10:51:52 1993 --- gawk-2.15.4/awk.h Tue Jan 4 16:27:48 1994 *************** *** 3,9 **** */ /* ! * Copyright (C) 1986, 1988, 1989, 1991, 1992 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. --- 3,9 ---- */ /* ! * Copyright (C) 1986, 1988, 1989, 1991, 1992, 1993 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. *************** *** 172,178 **** typedef struct Regexp { struct re_pattern_buffer pat; struct re_registers regs; ! 
struct regexp dfareg; int dfa; } Regexp; #define RESTART(rp,s) (rp)->regs.start[0] --- 172,178 ---- typedef struct Regexp { struct re_pattern_buffer pat; struct re_registers regs; ! struct dfa dfareg; int dfa; } Regexp; #define RESTART(rp,s) (rp)->regs.start[0] *************** *** 196,201 **** --- 196,217 ---- #define ENVSEP ':' #endif + #define DEFAULT_G_PRECISION 6 + + /* semi-temporary hack, mostly to gracefully handle VMS */ + #ifdef GFMT_WORKAROUND + extern void sgfmt P((char *, const char *, int, int, int, double)); /* builtin.c */ + + /* Partial fix, to handle the most common case. */ + #define NUMTOSTR(str, format, num) \ + if (strcmp((format), "%.6g") == 0 || strcmp((format), "%g") == 0) \ + sgfmt(str, "%*.*g", 0, 1, DEFAULT_G_PRECISION, num); \ + else \ + (void) sprintf(str, format, num) /* NOTE: no semi-colon! */ + #else + #define NUMTOSTR(str, format, num) (void) sprintf(str, format, num) + #endif /* GFMT_WORKAROUND */ + /* ------------------ Constants, Structures, Typedefs ------------------ */ #define AWKNUM double *************** *** 333,338 **** --- 349,355 ---- union { struct exp_node *lptr; char *param_name; + long ll; } l; union { struct exp_node *rptr; *************** *** 345,350 **** --- 362,368 ---- union { char *name; struct exp_node *extra; + long xl; } x; short number; unsigned char reflags; *************** *** 390,397 **** # define NUM 32 /* numeric value is current */ # define NUMBER 64 /* assigned as number */ # define MAYBE_NUM 128 /* user input: if NUMERIC then ! * a NUMBER ! */ char *vname; /* variable's name */ } NODE; --- 408,415 ---- # define NUM 32 /* numeric value is current */ # define NUMBER 64 /* assigned as number */ # define MAYBE_NUM 128 /* user input: if NUMERIC then ! * a NUMBER */ ! 
# define ARRAYMAXED 256 /* array is at max size */ char *vname; /* variable's name */ } NODE; *************** *** 424,429 **** --- 442,449 ---- #define var_value lnode #define var_array sub.nodep.r.av + #define array_size sub.nodep.l.ll + #define table_size sub.nodep.x.xl #define condpair lnode #define triggered sub.nodep.r.r_ent *************** *** 431,438 **** #ifdef DONTDEF int primes[] = {31, 61, 127, 257, 509, 1021, 2053, 4099, 8191, 16381}; #endif - /* a quick profile suggests that the following is a good value */ - #define HASHSIZE 1021 typedef struct for_loop_header { NODE *init; --- 451,456 ---- *************** *** 626,632 **** /* array.c */ extern NODE *concat_exp P((NODE *tree)); extern void assoc_clear P((NODE *symbol)); ! extern unsigned int hash P((char *s, size_t len)); extern int in_array P((NODE *symbol, NODE *subs)); extern NODE **assoc_lookup P((NODE *symbol, NODE *subs)); extern void do_delete P((NODE *symbol, NODE *tree)); --- 644,650 ---- /* array.c */ extern NODE *concat_exp P((NODE *tree)); extern void assoc_clear P((NODE *symbol)); ! extern unsigned int hash P((const char *s, size_t len, unsigned long hsize)); extern int in_array P((NODE *symbol, NODE *subs)); extern NODE **assoc_lookup P((NODE *symbol, NODE *subs)); extern void do_delete P((NODE *symbol, NODE *tree)); *************** *** 637,643 **** extern char nextc P((void)); extern NODE *node P((NODE *left, NODETYPE op, NODE *right)); extern NODE *install P((char *name, NODE *value)); ! extern NODE *lookup P((char *name)); extern NODE *variable P((char *name, int can_free)); extern int yyparse P((void)); /* builtin.c */ --- 655,661 ---- extern char nextc P((void)); extern NODE *node P((NODE *left, NODETYPE op, NODE *right)); extern NODE *install P((char *name, NODE *value)); ! 
extern NODE *lookup P((const char *name)); extern NODE *variable P((char *name, int can_free)); extern int yyparse P((void)); /* builtin.c */ *************** *** 693,700 **** extern NODE *do_close P((NODE *tree)); extern int flush_io P((void)); extern int close_io P((void)); ! extern int devopen P((char *name, char *mode)); ! extern int pathopen P((char *file)); extern NODE *do_getline P((NODE *tree)); extern void do_nextfile P((void)); /* iop.c */ --- 711,718 ---- extern NODE *do_close P((NODE *tree)); extern int flush_io P((void)); extern int close_io P((void)); ! extern int devopen P((const char *name, const char *mode)); ! extern int pathopen P((const char *file)); extern NODE *do_getline P((NODE *tree)); extern void do_nextfile P((void)); /* iop.c */ *************** *** 708,714 **** extern char *arg_assign P((char *arg)); extern SIGTYPE catchsig P((int sig, int code)); /* msg.c */ ! extern void err P((char *s, char *emsg, va_list argp)); #if _MSC_VER == 510 extern void msg P((va_list va_alist, ...)); extern void warning P((va_list va_alist, ...)); --- 726,732 ---- extern char *arg_assign P((char *arg)); extern SIGTYPE catchsig P((int sig, int code)); /* msg.c */ ! extern void err P((const char *s, const char *emsg, va_list argp)); #if _MSC_VER == 510 extern void msg P((va_list va_alist, ...)); extern void warning P((va_list va_alist, ...)); *************** *** 732,739 **** extern void unref P((NODE *tmp)); extern int parse_escape P((char **string_ptr)); /* re.c */ ! extern Regexp *make_regexp P((char *s, int len, int ignorecase, int dfa)); ! extern int research P((Regexp *rp, char *str, int start, int len, int need_start)); extern void refree P((Regexp *rp)); extern void reg_error P((const char *s)); extern Regexp *re_update P((NODE *t)); --- 750,758 ---- extern void unref P((NODE *tmp)); extern int parse_escape P((char **string_ptr)); /* re.c */ ! extern Regexp *make_regexp P((char *s, size_t len, int ignorecase, int dfa)); ! 
extern int research P((Regexp *rp, char *str, int start, ! size_t len, int need_start)); extern void refree P((Regexp *rp)); extern void reg_error P((const char *s)); extern Regexp *re_update P((NODE *t)); diff -crN gawk-2.15.3/awk.y gawk-2.15.4/awk.y *** gawk-2.15.3/awk.y Sat Nov 6 22:20:03 1993 --- gawk-2.15.4/awk.y Tue Jan 4 16:18:18 1994 *************** *** 3,9 **** */ /* ! * Copyright (C) 1986, 1988, 1989, 1991, 1992 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. --- 3,9 ---- */ /* ! * Copyright (C) 1986, 1988, 1989, 1991, 1992, 1993 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. *************** *** 56,64 **** #define YYDEBUG_LEXER_TEXT (lexeme) static int param_counter; static char *tokstart = NULL; ! static char *token = NULL; static char *tokend; NODE *variables[HASHSIZE]; extern char *source; --- 56,65 ---- #define YYDEBUG_LEXER_TEXT (lexeme) static int param_counter; static char *tokstart = NULL; ! static char *tok = NULL; static char *tokend; + #define HASHSIZE 1021 /* this constant only used here */ NODE *variables[HASHSIZE]; extern char *source; *************** *** 291,297 **** REGEXP '/' { NODE *n; ! int len; getnode(n); n->type = Node_regex; --- 292,298 ---- REGEXP '/' { NODE *n; ! size_t len; getnode(n); n->type = Node_regex; *************** *** 386,395 **** if ($2 && $2 == lookup("file")) { if (do_lint) warning("`next file' is a gawk extension"); ! else if (do_unix || do_posix) ! yyerror("`next file' is a gawk extension"); ! else if (! io_allowed) ! yyerror("`next file' used in BEGIN or END action"); type = Node_K_nextfile; } else { if (! io_allowed) --- 387,405 ---- if ($2 && $2 == lookup("file")) { if (do_lint) warning("`next file' is a gawk extension"); ! if (do_unix || do_posix) { ! /* ! * can't use yyerror, since may have overshot ! * the source line ! */ ! errcount++; ! 
msg("`next file' is a gawk extension"); ! } ! if (! io_allowed) { ! /* same thing */ ! errcount++; ! msg("`next file' used in BEGIN or END action"); ! } type = Node_K_nextfile; } else { if (! io_allowed) *************** *** 406,411 **** --- 416,435 ---- { $$ = node ($3, Node_K_return, (NODE *)NULL); } | LEX_DELETE NAME '[' expression_list ']' statement_term { $$ = node (variable($2,1), Node_K_delete, $4); } + | LEX_DELETE NAME statement_term + { + if (do_lint) + warning("`delete array' is a gawk extension"); + if (do_unix || do_posix) { + /* + * can't use yyerror, since may have overshot + * the source line + */ + errcount++; + msg("`delete array' is a gawk extension"); + } + $$ = node (variable($2,1), Node_K_delete, (NODE *) NULL); + } | exp statement_term { $$ = $1; } ; *************** *** 746,752 **** %% struct token { ! char *operator; /* text to match */ NODETYPE value; /* node type */ int class; /* lexical class */ unsigned flags; /* # of args. allowed and compatability */ --- 770,776 ---- %% struct token { ! const char *operator; /* text to match */ NODETYPE value; /* node type */ int class; /* lexical class */ unsigned flags; /* # of args. allowed and compatability */ *************** *** 820,829 **** va_dcl { va_list args; ! char *mesg = NULL; register char *bp, *cp; char *scan; char buf[120]; errcount++; /* Find the current line in the input file */ --- 844,854 ---- va_dcl { va_list args; ! const char *mesg = NULL; register char *bp, *cp; char *scan; char buf[120]; + static char end_of_file_line[] = "(END OF FILE)"; errcount++; /* Find the current line in the input file */ *************** *** 845,852 **** while (bp < lexend && *bp && *bp != '\n') bp++; } else { ! thisline = "(END OF FILE)"; ! bp = thisline + 13; } msg("%.*s", (int) (bp - thisline), thisline); bp = buf; --- 870,877 ---- while (bp < lexend && *bp && *bp != '\n') bp++; } else { ! thisline = end_of_file_line; ! 
bp = thisline + strlen(thisline); } msg("%.*s", (int) (bp - thisline), thisline); bp = buf; *************** *** 982,988 **** return buf; } ! #define tokadd(x) (*token++ = (x), token == tokend ? tokexpand() : token) char * tokexpand() --- 1007,1013 ---- return buf; } ! #define tokadd(x) (*tok++ = (x), tok == tokend ? tokexpand() : tok) char * tokexpand() *************** *** 990,1004 **** static int toksize = 60; int tokoffset; ! tokoffset = token - tokstart; toksize *= 2; if (tokstart) erealloc(tokstart, char *, toksize, "tokexpand"); else emalloc(tokstart, char *, toksize, "tokexpand"); tokend = tokstart + toksize; ! token = tokstart + tokoffset; ! return token; } #if DEBUG --- 1015,1029 ---- static int toksize = 60; int tokoffset; ! tokoffset = tok - tokstart; toksize *= 2; if (tokstart) erealloc(tokstart, char *, toksize, "tokexpand"); else emalloc(tokstart, char *, toksize, "tokexpand"); tokend = tokstart + toksize; ! tok = tokstart + tokoffset; ! return tok; } #if DEBUG *************** *** 1053,1059 **** int in_brack = 0; want_regexp = 0; ! token = tokstart; while ((c = nextc()) != 0) { switch (c) { case '[': --- 1078,1084 ---- int in_brack = 0; want_regexp = 0; ! tok = tokstart; while ((c = nextc()) != 0) { switch (c) { case '[': *************** *** 1094,1100 **** lexeme = lexptr ? lexptr - 1 : lexptr; thisline = NULL; ! token = tokstart; yylval.nodetypeval = Node_illegal; switch (c) { --- 1119,1125 ---- lexeme = lexptr ? lexptr - 1 : lexptr; thisline = NULL; ! tok = tokstart; yylval.nodetypeval = Node_illegal; switch (c) { *************** *** 1115,1127 **** case '\\': #ifdef RELAXED_CONTINUATION ! if (!do_unix) { /* strip trailing white-space and/or comment */ ! while ((c = nextc()) == ' ' || c == '\t') continue; if (c == '#') ! while ((c = nextc()) != '\n') if (!c) break; pushback(); } ! #endif /*RELAXED_CONTINUATION*/ if (nextc() == '\n') { sourceline++; goto retry; --- 1140,1162 ---- case '\\': #ifdef RELAXED_CONTINUATION ! /* ! 
* This code purports to allow comments and/or whitespace ! * after the `\' at the end of a line used for continuation. ! * Use it at your own risk. We think it's a bad idea, which ! * is why it's not on by default. ! */ ! if (!do_unix) { ! /* strip trailing white-space and/or comment */ ! while ((c = nextc()) == ' ' || c == '\t') ! continue; if (c == '#') ! while ((c = nextc()) != '\n') ! if (c == '\0') ! break; pushback(); } ! #endif /* RELAXED_CONTINUATION */ if (nextc() == '\n') { sourceline++; goto retry; *************** *** 1307,1313 **** tokadd(c); } yylval.nodeval = make_str_node(tokstart, ! token - tokstart, esc_seen ? SCAN : 0); yylval.nodeval->flags |= PERM; return YSTRING; --- 1342,1348 ---- tokadd(c); } yylval.nodeval = make_str_node(tokstart, ! tok - tokstart, esc_seen ? SCAN : 0); yylval.nodeval->flags |= PERM; return YSTRING; *************** *** 1443,1456 **** yyerror("Invalid char '%c' in expression\n", c); /* it's some type of name-type-thing. Find its length */ ! token = tokstart; while (is_identchar(c)) { tokadd(c); c = nextc(); } tokadd('\0'); ! emalloc(tokkey, char *, token - tokstart, "yylex"); ! memcpy(tokkey, tokstart, token - tokstart); pushback(); /* See if it is a special token. */ --- 1478,1491 ---- yyerror("Invalid char '%c' in expression\n", c); /* it's some type of name-type-thing. Find its length */ ! tok = tokstart; while (is_identchar(c)) { tokadd(c); c = nextc(); } tokadd('\0'); ! emalloc(tokkey, char *, tok - tokstart, "yylex"); ! memcpy(tokkey, tokstart, tok - tokstart); pushback(); /* See if it is a special token. */ *************** *** 1653,1659 **** register int bucket; len = strlen(name); ! bucket = hash(name, len); getnode(hp); hp->type = Node_hashnode; hp->hnext = variables[bucket]; --- 1688,1694 ---- register int bucket; len = strlen(name); ! 
bucket = hash(name, len, (unsigned long) HASHSIZE); getnode(hp); hp->type = Node_hashnode; hp->hnext = variables[bucket]; *************** *** 1668,1680 **** /* find the most recent hash node for name installed by install */ NODE * lookup(name) ! char *name; { register NODE *bucket; register size_t len; len = strlen(name); ! bucket = variables[hash(name, len)]; while (bucket) { if (bucket->hlength == len && STREQN(bucket->hname, name, len)) return bucket->hvalue; --- 1703,1715 ---- /* find the most recent hash node for name installed by install */ NODE * lookup(name) ! const char *name; { register NODE *bucket; register size_t len; len = strlen(name); ! bucket = variables[hash(name, len, (unsigned long) HASHSIZE)]; while (bucket) { if (bucket->hlength == len && STREQN(bucket->hname, name, len)) return bucket->hvalue; *************** *** 1738,1744 **** name = np->param; len = strlen(name); ! save = &(variables[hash(name, len)]); for (bucket = *save; bucket; bucket = bucket->hnext) { if (len == bucket->hlength && STREQN(bucket->hname, name, len)) { *save = bucket->hnext; --- 1773,1779 ---- name = np->param; len = strlen(name); ! save = &(variables[hash(name, len, (unsigned long) HASHSIZE)]); for (bucket = *save; bucket; bucket = bucket->hnext) { if (len == bucket->hlength && STREQN(bucket->hname, name, len)) { *save = bucket->hnext; diff -crN gawk-2.15.3/builtin.c gawk-2.15.4/builtin.c *** gawk-2.15.3/builtin.c Sun Nov 7 10:25:05 1993 --- gawk-2.15.4/builtin.c Fri Jan 7 14:24:48 1994 *************** *** 3,9 **** */ /* ! * Copyright (C) 1986, 1988, 1989, 1991, 1992 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. --- 3,9 ---- */ /* ! * Copyright (C) 1986, 1988, 1989, 1991, 1992, 1993 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. 
*************** *** 26,32 **** #include "awk.h" - #ifndef SRANDOM_PROTO extern void srandom P((int seed)); #endif --- 26,31 ---- *************** *** 41,50 **** static NODE *sub_common P((NODE *tree, int global)); - #ifdef GFMT_WORKAROUND - char *gfmt P((double g, int prec, char *buf)); - #endif - #ifdef _CRAY /* Work around a problem in conversion of doubles to exact integers. */ #include --- 40,45 ---- *************** *** 63,82 **** #define Ceil(n) ceil(n) #endif ! #if __STDC__ ! static void ! efwrite(void *ptr, size_t size, size_t count, FILE *fp, ! char *from, struct redirect *rp,int flush) ! #else static void efwrite(ptr, size, count, fp, from, rp, flush) ! void *ptr; size_t size, count; FILE *fp; ! char *from; struct redirect *rp; int flush; - #endif { errno = 0; if (fwrite(ptr, size, count, fp) != count) --- 58,75 ---- #define Ceil(n) ceil(n) #endif ! ! static void efwrite P((const void *ptr, size_t size, size_t count, FILE *fp, ! const char *from, struct redirect *rp,int flush)); ! static void efwrite(ptr, size, count, fp, from, rp, flush) ! const void *ptr; size_t size, count; FILE *fp; ! const char *from; struct redirect *rp; int flush; { errno = 0; if (fwrite(ptr, size, count, fp) != count) *************** *** 217,239 **** return tmp_number((AWKNUM) d); } ! /* %e and %f formats are not properly implemented. Someone should fix them */ ! /* Actually, this whole thing should be reimplemented. */ NODE * do_sprintf(tree) NODE *tree; { #define bchunk(s,l) if(l) {\ while((l)>ofre) {\ erealloc(obuf, char *, osiz*2, "do_sprintf");\ ofre+=osiz;\ osiz*=2;\ }\ ! memcpy(obuf+olen,s,(size_t)(l));\ ! olen+=(l);\ ofre-=(l);\ } /* Is there space for something L big in the buffer? */ #define chksize(l) if((l)>ofre) {\ --- 210,251 ---- return tmp_number((AWKNUM) d); } ! /* ! * do_sprintf does the sprintf function. It is one of the uglier parts of ! * gawk. Thanks to Michal Jaegerman for taming this beast and making it ! * compatible with ANSI C. ! 
*/ NODE * do_sprintf(tree) NODE *tree; { + /* copy 'l' bytes from 's' to 'obufout' checking for space in the process */ + /* difference of pointers should be of ptrdiff_t type, but let us be kind */ #define bchunk(s,l) if(l) {\ while((l)>ofre) {\ + long olen = obufout - obuf;\ erealloc(obuf, char *, osiz*2, "do_sprintf");\ ofre+=osiz;\ osiz*=2;\ + obufout = obuf + olen;\ }\ ! memcpy(obufout,s,(size_t)(l));\ ! obufout+=(l);\ ofre-=(l);\ } + /* copy one byte from 's' to 'obufout' checking for space in the process */ + #define bchunk_one(s) {\ + if(ofre <= 0) {\ + long olen = obufout - obuf;\ + erealloc(obuf, char *, osiz*2, "do_sprintf");\ + ofre+=osiz;\ + osiz*=2;\ + obufout = obuf + olen;\ + }\ + *obufout++ = *s;\ + --ofre;\ + } /* Is there space for something L big in the buffer? */ #define chksize(l) if((l)>ofre) {\ *************** *** 256,270 **** NODE *r; int toofew = 0; ! char *obuf; ! size_t osiz, ofre, olen; ! static char chbuf[] = "0123456789abcdef"; ! static char sp[] = " "; char *s0, *s1; int n0; NODE *sfmt, *arg; register NODE *carg; ! long fw, prec, lj, alt, big; long *cur; long val; #ifdef sun386 /* Can't cast unsigned (int/long) from ptr->value */ --- 268,283 ---- NODE *r; int toofew = 0; ! char *obuf, *obufout; ! size_t osiz, ofre; ! char *chbuf; char *s0, *s1; + int cs1; int n0; NODE *sfmt, *arg; register NODE *carg; ! long fw, prec; ! int lj, alt, big; long *cur; long val; #ifdef sun386 /* Can't cast unsigned (int/long) from ptr->value */ *************** *** 278,293 **** char *cp; char *fill; double tmpval; - char *pr_str; - int ucasehex = 0; char signchar = 0; size_t len; ! emalloc(obuf, char *, 120, "do_sprintf"); osiz = 120; ofre = osiz - 1; - olen = 0; sfmt = tree_eval(tree->lnode); sfmt = force_string(sfmt); carg = tree->rnode; --- 291,307 ---- char *cp; char *fill; double tmpval; char signchar = 0; size_t len; ! static char sp[] = " "; ! static char zero_string[] = "0"; ! static char lchbuf[] = "0123456789abcdefx"; ! 
static char Uchbuf[] = "0123456789ABCDEFX"; emalloc(obuf, char *, 120, "do_sprintf"); + obufout = obuf; osiz = 120; ofre = osiz - 1; sfmt = tree_eval(tree->lnode); sfmt = force_string(sfmt); carg = tree->rnode; *************** *** 308,324 **** retry: --n0; ! switch (*s1++) { case '%': ! bchunk("%", 1); s0 = s1; break; case '0': ! if (fill != sp || lj) ! goto lose; if (cur == &fw) ! fill = "0"; /* FALL through */ case '1': case '2': case '3': --- 322,338 ---- retry: --n0; ! switch (cs1 = *s1++) { case '%': ! bchunk_one("%"); s0 = s1; break; case '0': ! if (lj) ! goto retry; if (cur == &fw) ! fill = zero_string; /* FALL through */ case '1': case '2': case '3': *************** *** 329,370 **** case '8': case '9': if (cur == 0) ! goto lose; ! *cur = s1[-1] - '0'; while (n0 > 0 && *s1 >= '0' && *s1 <= '9') { --n0; *cur = *cur * 10 + *s1++ - '0'; } goto retry; case '*': if (cur == 0) ! goto lose; parse_next_arg(); *cur = force_number(arg); free_temp(arg); goto retry; case ' ': /* print ' ' or '-' */ case '+': /* print '+' or '-' */ ! signchar = *(s1-1); goto retry; case '-': ! if (lj || fill != sp) ! goto lose; ! lj++; goto retry; case '.': if (cur != &fw) ! goto lose; cur = &prec; goto retry; case '#': ! if (alt) ! goto lose; alt++; goto retry; case 'l': if (big) ! goto lose; big++; goto retry; case 'c': --- 343,400 ---- case '8': case '9': if (cur == 0) ! /* goto lose; */ ! break; ! if (prec >= 0) /* this happens only when we have */ ! /* a negative precision */ ! *cur = cs1 - '0'; while (n0 > 0 && *s1 >= '0' && *s1 <= '9') { --n0; *cur = *cur * 10 + *s1++ - '0'; } + if (prec < 0) { /* negative precision is discarded */ + prec = 0; + cur = 0; + } goto retry; case '*': if (cur == 0) ! /* goto lose; */ ! break; parse_next_arg(); *cur = force_number(arg); free_temp(arg); goto retry; case ' ': /* print ' ' or '-' */ + /* 'space' flag is ignored */ + /* if '+' already present */ + if (signchar != 0) + goto retry; + /* FALL THROUGH */ case '+': /* print '+' or '-' */ ! 
signchar = cs1; goto retry; case '-': ! if (cur == &prec) { ! prec = -1; ! goto retry; ! } ! fill = sp; /* if left justified then other */ ! lj++; /* filling is ignored */ goto retry; case '.': if (cur != &fw) ! break; cur = &prec; goto retry; case '#': ! if (cur != &fw) ! break; alt++; goto retry; case 'l': if (big) ! break; big++; goto retry; case 'c': *************** *** 378,421 **** #endif cpbuf[0] = uval; prec = 1; ! pr_str = cpbuf; ! goto dopr_string; } ! if (! prec) prec = 1; else if (prec > arg->stlen) prec = arg->stlen; ! pr_str = arg->stptr; ! goto dopr_string; case 's': parse_next_arg(); arg = force_string(arg); ! if (!prec || prec > arg->stlen) prec = arg->stlen; ! pr_str = arg->stptr; ! ! dopr_string: ! if (fw > prec && !lj) { ! while (fw > prec) { ! bchunk(fill, 1); ! fw--; ! } ! } ! bchunk(pr_str, (int) prec); ! if (fw > prec) { ! while (fw > prec) { ! bchunk(fill, 1); ! fw--; ! } ! } ! s0 = s1; ! free_temp(arg); ! break; case 'd': case 'i': parse_next_arg(); val = (long) force_number(arg); - free_temp(arg); if (val < 0) { sgn = 1; val = -val; --- 408,433 ---- #endif cpbuf[0] = uval; prec = 1; ! cp = cpbuf; ! goto pr_tail; } ! if (prec == 0) prec = 1; else if (prec > arg->stlen) prec = arg->stlen; ! cp = arg->stptr; ! goto pr_tail; case 's': parse_next_arg(); arg = force_string(arg); ! if (prec == 0 || prec > arg->stlen) prec = arg->stlen; ! cp = arg->stptr; ! goto pr_tail; case 'd': case 'i': parse_next_arg(); val = (long) force_number(arg); if (val < 0) { sgn = 1; val = -val; *************** *** 429,458 **** *--cp = '-'; else if (signchar) *--cp = signchar; if (prec > fw) fw = prec; prec = cend - cp; ! if (fw > prec && !lj) { ! if (fill != sp && (*cp == '-' || signchar)) { ! bchunk(cp, 1); ! cp++; ! prec--; ! fw--; ! } ! while (fw > prec) { ! bchunk(fill, 1); ! fw--; ! } ! } ! bchunk(cp, (int) prec); ! if (fw > prec) { ! while (fw > prec) { ! bchunk(fill, 1); ! fw--; ! } } ! s0 = s1; ! 
break; case 'u': base = 10; goto pr_unsigned; --- 441,459 ---- *--cp = '-'; else if (signchar) *--cp = signchar; + if (prec != 0) /* ignore '0' flag if */ + fill = sp; /* precision given */ if (prec > fw) fw = prec; prec = cend - cp; ! if (fw > prec && ! lj && fill != sp ! && (*cp == '-' || signchar)) { ! bchunk_one(cp); ! cp++; ! prec--; ! fw--; } ! goto pr_tail; case 'u': base = 10; goto pr_unsigned; *************** *** 460,505 **** base = 8; goto pr_unsigned; case 'X': - ucasehex = 1; case 'x': base = 16; - goto pr_unsigned; pr_unsigned: parse_next_arg(); uval = (unsigned long) force_number(arg); - free_temp(arg); do { *--cp = chbuf[uval % base]; - if (ucasehex && isalpha(*cp)) - *cp = toupper(*cp); uval /= base; } while (uval); ! if (alt && (base == 8 || base == 16)) { if (base == 16) { ! if (ucasehex) ! *--cp = 'X'; ! else ! *--cp = 'x'; ! } ! *--cp = '0'; } prec = cend - cp; ! if (fw > prec && !lj) { while (fw > prec) { ! bchunk(fill, 1); fw--; } } bchunk(cp, (int) prec); ! if (fw > prec) { ! while (fw > prec) { ! bchunk(fill, 1); ! fw--; ! } } s0 = s1; break; case 'g': parse_next_arg(); tmpval = force_number(arg); free_temp(arg); --- 461,514 ---- base = 8; goto pr_unsigned; case 'X': case 'x': base = 16; pr_unsigned: + if (cs1 == 'X') + chbuf = Uchbuf; + else + chbuf = lchbuf; + if (prec != 0) /* ignore '0' flag if */ + fill = sp; /* precision given */ parse_next_arg(); uval = (unsigned long) force_number(arg); do { *--cp = chbuf[uval % base]; uval /= base; } while (uval); ! if (alt) { if (base == 16) { ! *--cp = cs1; ! *--cp = '0'; ! if (fill != sp) { ! bchunk(cp, 2); ! cp += 2; ! fw -= 2; ! } ! } else if (base == 8) ! *--cp = '0'; } prec = cend - cp; ! pr_tail: ! if (! lj) { while (fw > prec) { ! bchunk_one(fill); fw--; } } bchunk(cp, (int) prec); ! while (fw > prec) { ! bchunk_one(fill); ! 
fw--; } s0 = s1; + free_temp(arg); break; + case 'e': + case 'f': case 'g': + case 'E': + case 'G': parse_next_arg(); tmpval = force_number(arg); free_temp(arg); *************** *** 509,599 **** *cp++ = '%'; if (lj) *cp++ = '-'; if (fill != sp) *cp++ = '0'; #ifndef GFMT_WORKAROUND ! if (cur != &fw) { ! (void) strcpy(cp, "*.*g"); ! (void) sprintf(obuf + olen, cpbuf, (int) fw, (int) prec, (double) tmpval); ! } else { ! (void) strcpy(cp, "*g"); ! (void) sprintf(obuf + olen, cpbuf, (int) fw, (double) tmpval); ! } #else /* GFMT_WORKAROUND */ ! { ! char *gptr, gbuf[120]; ! #define DEFAULT_G_PRECISION 6 ! if (fw + prec + 9 > sizeof gbuf) { /* 9==slop */ ! emalloc(gptr, char *, fw+prec+9, "do_sprintf(gfmt)"); ! } else ! gptr = gbuf; ! (void) gfmt((double) tmpval, cur != &fw ? ! (int) prec : DEFAULT_G_PRECISION, gptr); ! *cp++ = '*', *cp++ = 's', *cp = '\0'; ! (void) sprintf(obuf + olen, cpbuf, (int) fw, gptr); ! if (fill != sp && *gptr == ' ') { ! char *p = gptr; ! do { *p++ = '0'; } while (*p == ' '); ! } ! if (gptr != gbuf) free(gptr); ! } #endif /* GFMT_WORKAROUND */ ! len = strlen(obuf + olen); ofre -= len; ! 
olen += len; s0 = s1; break; - - case 'f': - parse_next_arg(); - tmpval = force_number(arg); - free_temp(arg); - chksize(fw + prec + 9); /* 9==slop */ - - cp = cpbuf; - *cp++ = '%'; - if (lj) - *cp++ = '-'; - if (fill != sp) - *cp++ = '0'; - if (cur != &fw) { - (void) strcpy(cp, "*.*f"); - (void) sprintf(obuf + olen, cpbuf, (int) fw, (int) prec, (double) tmpval); - } else { - (void) strcpy(cp, "*f"); - (void) sprintf(obuf + olen, cpbuf, (int) fw, (double) tmpval); - } - len = strlen(obuf + olen); - ofre -= len; - olen += len; - s0 = s1; - break; - case 'e': - parse_next_arg(); - tmpval = force_number(arg); - free_temp(arg); - chksize(fw + prec + 9); /* 9==slop */ - cp = cpbuf; - *cp++ = '%'; - if (lj) - *cp++ = '-'; - if (fill != sp) - *cp++ = '0'; - if (cur != &fw) { - (void) strcpy(cp, "*.*e"); - (void) sprintf(obuf + olen, cpbuf, (int) fw, (int) prec, (double) tmpval); - } else { - (void) strcpy(cp, "*e"); - (void) sprintf(obuf + olen, cpbuf, (int) fw, (double) tmpval); - } - len = strlen(obuf + olen); - ofre -= len; - olen += len; - s0 = s1; - break; - default: - lose: break; } if (toofew) --- 518,551 ---- *cp++ = '%'; if (lj) *cp++ = '-'; + if (signchar) + *cp++ = signchar; + if (alt) + *cp++ = '#'; if (fill != sp) *cp++ = '0'; + cp = strcpy(cp, "*.*") + 3; + *cp++ = cs1; + *cp = '\0'; + if (prec <= 0) + prec = DEFAULT_G_PRECISION; #ifndef GFMT_WORKAROUND ! (void) sprintf(obufout, cpbuf, ! (int) fw, (int) prec, (double) tmpval); #else /* GFMT_WORKAROUND */ ! if (cs1 == 'g' || cs1 == 'G') ! (void) sgfmt(obufout, cpbuf, (int) alt, ! (int) fw, (int) prec, (double) tmpval); ! else ! (void) sprintf(obufout, cpbuf, ! (int) fw, (int) prec, (double) tmpval); #endif /* GFMT_WORKAROUND */ ! len = strlen(obufout); ofre -= len; ! obufout += len; s0 = s1; break; default: break; } if (toofew) *************** *** 607,613 **** warning("too many arguments supplied for format string"); bchunk(s0, s1 - s0); free_temp(sfmt); ! 
r = make_str_node(obuf, olen, ALREADY_MALLOCED); r->flags |= TEMP; return r; } --- 559,565 ---- warning("too many arguments supplied for format string"); bchunk(s0, s1 - s0); free_temp(sfmt); ! r = make_str_node(obuf, obufout - obuf, ALREADY_MALLOCED); r->flags |= TEMP; return r; } *************** *** 796,802 **** else { char buf[100]; ! sprintf(buf, OFMT, t1->numbr); t1 = tmp_string(buf, strlen(buf)); } } --- 748,755 ---- else { char buf[100]; ! NUMTOSTR(buf, OFMT, t1->numbr); ! free_temp(t1); t1 = tmp_string(buf, strlen(buf)); } } *************** *** 1125,1165 **** } #ifdef GFMT_WORKAROUND ! /* ! * printf's %g format [can't rely on gcvt()] ! * caveat: don't use as argument to *printf()! ! */ ! char * ! gfmt(g, prec, buf) ! double g; /* value to format */ ! int prec; /* indicates desired significant digits, not decimal places */ char *buf; /* return buffer; assumed big enough to hold result */ { ! if (g == 0.0) { ! (void) strcpy(buf, "0"); /* easy special case */ ! } else { ! register char *d, *e, *p; ! /* start with 'e' format (it'll provide nice exponent) */ ! if (prec < 1) prec = 1; /* at least 1 significant digit */ ! (void) sprintf(buf, "%.*e", prec - 1, g); ! if ((e = strchr(buf, 'e')) != 0) { /* find exponent */ ! int exp = atoi(e+1); /* fetch exponent */ ! if (exp >= -4 && exp < prec) { /* per K&R2, B1.2 */ ! /* switch to 'f' format and re-do */ ! prec -= (exp + 1); /* decimal precision */ ! (void) sprintf(buf, "%.*f", prec, g); ! e = buf + strlen(buf); ! } ! if ((d = strchr(buf, '.')) != 0) { ! /* remove trailing zeroes and decimal point */ ! for (p = e; p > d && *--p == '0'; ) continue; ! if (*p == '.') --p; ! if (++p < e) /* copy exponent and NUL */ ! while ((*p++ = *e++) != '\0') continue; ! } } } - return buf; } #endif /* GFMT_WORKAROUND */ --- 1078,1152 ---- } #ifdef GFMT_WORKAROUND ! /* ! * printf's %g format [can't rely on gcvt()] ! * caveat: don't use as argument to *printf()! ! * 'format' string HAS to be of "*.*g" kind, or we bomb! ! */ ! void ! 
sgfmt(buf, format, alt, fwidth, prec, g) char *buf; /* return buffer; assumed big enough to hold result */ + const char *format; + int alt; /* use alternate form flag */ + int fwidth; /* field width in a format */ + int prec; /* indicates desired significant digits, not decimal places */ + double g; /* value to format */ { ! char dform[40]; ! register char *gpos; ! register char *d, *e, *p; ! int again = 0; ! ! strncpy(dform, format, sizeof dform - 1); ! dform[sizeof dform - 1] = '\0'; ! gpos = strrchr(dform, '.'); ! ! if (g == 0.0 && alt == 0) { /* easy special case */ ! *gpos++ = 'd'; ! *gpos = '\0'; ! (void) sprintf(buf, dform, fwidth, 0); ! return; ! } ! gpos += 2; /* advance to location of 'g' in the format */ ! if (prec <= 0) /* negative precision is ignored */ ! prec = (prec < 0 ? DEFAULT_G_PRECISION : 1); ! ! if (*gpos == 'G') ! again = 1; ! /* start with 'e' format (it'll provide nice exponent) */ ! *gpos = 'e'; ! prec -= 1; ! (void) sprintf(buf, dform, fwidth, prec, g); ! if ((e = strrchr(buf, 'e')) != NULL) { /* find exponent */ ! int exp = atoi(e+1); /* fetch exponent */ ! if (exp >= -4 && exp <= prec) { /* per K&R2, B1.2 */ ! /* switch to 'f' format and re-do */ ! *gpos = 'f'; ! prec -= exp; /* decimal precision */ ! (void) sprintf(buf, dform, fwidth, prec, g); ! e = buf + strlen(buf); ! while (*--e == ' ') ! continue; ! e += 1; ! } ! else if (again != 0) ! *gpos = 'E'; ! ! /* if 'alt' in force, then trailing zeros are not removed */ ! if (alt == 0 && (d = strrchr(buf, '.')) != NULL) { ! /* throw away an excess of precision */ ! for (p = e; p > d && *--p == '0'; ) ! prec -= 1; ! if (d == p) ! prec -= 1; ! if (prec < 0) ! prec = 0; ! /* and do that once again */ ! 
again = 1; } + if (again != 0) + (void) sprintf(buf, dform, fwidth, prec, g); } } #endif /* GFMT_WORKAROUND */ diff -crN gawk-2.15.3/config/sgi405.cc gawk-2.15.4/config/sgi405.cc *** gawk-2.15.3/config/sgi405.cc Mon Oct 18 08:59:54 1993 --- gawk-2.15.4/config/sgi405.cc Thu Jan 20 22:17:32 1994 *************** *** 5,7 **** --- 5,8 ---- GETPGRP_NOARG 1 MAKE_CC MAKE_SGI + MAKE_ALLOCA_C diff -crN gawk-2.15.3/config/solaris2.cc gawk-2.15.4/config/solaris2.cc *** gawk-2.15.3/config/solaris2.cc Wed Dec 31 19:00:00 1969 --- gawk-2.15.4/config/solaris2.cc Thu Jan 13 21:15:18 1994 *************** *** 0 **** --- 1,7 ---- + Solaris 2.x Systems with cc + RANDOM_MISSING 1 + STRCASE_MISSING 1 + STDC_HEADERS 1 + SVR4 1 + MAKE_ALLOCA_C + MAKE_CC diff -crN gawk-2.15.3/config/sunos41.cc gawk-2.15.4/config/sunos41.cc *** gawk-2.15.3/config/sunos41.cc Wed Dec 31 19:00:00 1969 --- gawk-2.15.4/config/sunos41.cc Thu Nov 18 09:35:32 1993 *************** *** 0 **** --- 1,6 ---- + Sun running SunOS 4.1 + MAKE_CC + MAKE_ALLOCA_C + HAVE_UNDERSCORE_SETJMP 1 + STRERROR_MISSING 1 + NON_STD_SPRINTF 1 diff -crN gawk-2.15.3/config/vms-conf.h gawk-2.15.4/config/vms-conf.h *** gawk-2.15.3/config/vms-conf.h Sun Oct 11 13:45:35 1992 --- gawk-2.15.4/config/vms-conf.h Sun Dec 19 16:40:58 1993 *************** *** 286,292 **** * Digital's ANSI complier. */ #ifdef __DECC ! /* nothing special at the moment */ #endif /* --- 286,293 ---- * Digital's ANSI complier. */ #ifdef __DECC ! /* DEC C implies DECC$SHR, which doesn't have the %g problem of VAXCRTL */ ! 
#undef GFMT_WORKAROUND #endif /* diff -crN gawk-2.15.3/config/vms-posix gawk-2.15.4/config/vms-posix *** gawk-2.15.3/config/vms-posix Fri Oct 23 15:26:13 1992 --- gawk-2.15.4/config/vms-posix Tue Dec 7 23:14:30 1993 *************** *** 9,11 **** --- 9,15 ---- #define DEFAULT_FILETYPE ".awk" #define getopt gnu_getopt #define opterr gnu_opterr + #define regcomp gnu_regcomp + #define regexec gnu_regexec + #define regfree gnu_regfree + #define regerror gnu_regerror diff -crN gawk-2.15.3/config.in gawk-2.15.4/config.in *** gawk-2.15.3/config.in Mon Oct 18 09:01:45 1993 --- gawk-2.15.4/config.in Thu Jan 13 21:15:31 1994 *************** *** 5,11 **** */ /* ! * Copyright (C) 1991, 1992 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. --- 5,11 ---- */ /* ! * Copyright (C) 1991, 1992, 1993 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. *************** *** 273,275 **** --- 273,288 ---- * getpgrp() in sysvr4 and POSIX takes no argument */ /* #define GETPGRP_NOARG 0 */ + + /* + * define const to nothing if not __STDC__ + */ + #ifndef __STDC__ + #define const + #endif + + /* If svr4 and not gcc */ + /* #define SVR4 0 */ + #ifdef SVR4 + #define __svr4__ 1 + #endif diff -crN gawk-2.15.3/dfa.c gawk-2.15.4/dfa.c *** gawk-2.15.3/dfa.c Wed Dec 31 19:00:00 1969 --- gawk-2.15.4/dfa.c Tue Jan 4 16:18:20 1994 *************** *** 0 **** --- 1,2585 ---- + /* dfa.c - deterministic extended regexp routines for GNU + Copyright (C) 1988 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + + /* Written June, 1988 by Mike Haertel + Modified July, 1988 by Arthur David Olson to assist BMG speedups */ + + #include + #include + #include + + #ifdef HAVE_CONFIG_H + #include "config.h" + #endif + + #ifdef STDC_HEADERS + #include + #else + #include + extern char *calloc(), *malloc(), *realloc(); + extern void free(); + #endif + + #if defined(HAVE_STRING_H) || defined(STDC_HEADERS) + #include + #undef index + #define index strchr + #else + #include + #endif + + #ifndef DEBUG /* use the same approach as regex.c */ + #undef assert + #define assert(e) + #endif /* DEBUG */ + + #ifndef isgraph + #define isgraph(C) (isprint(C) && !isspace(C)) + #endif + + #ifdef isascii + #define ISALPHA(C) (isascii(C) && isalpha(C)) + #define ISUPPER(C) (isascii(C) && isupper(C)) + #define ISLOWER(C) (isascii(C) && islower(C)) + #define ISDIGIT(C) (isascii(C) && isdigit(C)) + #define ISXDIGIT(C) (isascii(C) && isxdigit(C)) + #define ISSPACE(C) (isascii(C) && isspace(C)) + #define ISPUNCT(C) (isascii(C) && ispunct(C)) + #define ISALNUM(C) (isascii(C) && isalnum(C)) + #define ISPRINT(C) (isascii(C) && isprint(C)) + #define ISGRAPH(C) (isascii(C) && isgraph(C)) + #define ISCNTRL(C) (isascii(C) && iscntrl(C)) + #else + #define ISALPHA(C) isalpha(C) + #define ISUPPER(C) isupper(C) + #define ISLOWER(C) islower(C) + #define ISDIGIT(C) isdigit(C) + #define ISXDIGIT(C) isxdigit(C) + #define ISSPACE(C) isspace(C) + #define ISPUNCT(C) ispunct(C) + #define ISALNUM(C) isalnum(C) + #define ISPRINT(C) isprint(C) + #define ISGRAPH(C) isgraph(C) + #define ISCNTRL(C) 
iscntrl(C) + #endif + + #include "regex.h" + #include "dfa.h" + + #ifdef __STDC__ + typedef void *ptr_t; + #else + typedef char *ptr_t; + #endif + + static void dfamust _RE_ARGS((struct dfa *dfa)); + + static ptr_t xcalloc _RE_ARGS((size_t n, size_t s)); + static ptr_t xmalloc _RE_ARGS((size_t n)); + static ptr_t xrealloc _RE_ARGS((ptr_t p, size_t n)); + #ifdef DEBUG + static void prtok _RE_ARGS((token t)); + #endif + static int tstbit _RE_ARGS((int b, charclass c)); + static void setbit _RE_ARGS((int b, charclass c)); + static void clrbit _RE_ARGS((int b, charclass c)); + static void copyset _RE_ARGS((charclass src, charclass dst)); + static void zeroset _RE_ARGS((charclass s)); + static void notset _RE_ARGS((charclass s)); + static int equal _RE_ARGS((charclass s1, charclass s2)); + static int charclass_index _RE_ARGS((charclass s)); + static int looking_at _RE_ARGS((const char *s)); + static token lex _RE_ARGS((void)); + static void addtok _RE_ARGS((token t)); + static void atom _RE_ARGS((void)); + static int nsubtoks _RE_ARGS((int tindex)); + static void copytoks _RE_ARGS((int tindex, int ntokens)); + static void closure _RE_ARGS((void)); + static void branch _RE_ARGS((void)); + static void regexp _RE_ARGS((int toplevel)); + static void copy _RE_ARGS((position_set *src, position_set *dst)); + static void insert _RE_ARGS((position p, position_set *s)); + static void merge _RE_ARGS((position_set *s1, position_set *s2, position_set *m)); + static void delete _RE_ARGS((position p, position_set *s)); + static int state_index _RE_ARGS((struct dfa *d, position_set *s, + int newline, int letter)); + static void build_state _RE_ARGS((int s, struct dfa *d)); + static void build_state_zero _RE_ARGS((struct dfa *d)); + static char *icatalloc _RE_ARGS((char *old, char *new)); + static char *icpyalloc _RE_ARGS((char *string)); + static char *istrstr _RE_ARGS((char *lookin, char *lookfor)); + static void ifree _RE_ARGS((char *cp)); + static void freelist _RE_ARGS((char 
**cpp)); + static char **enlist _RE_ARGS((char **cpp, char *new, size_t len)); + static char **comsubs _RE_ARGS((char *left, char *right)); + static char **addlists _RE_ARGS((char **old, char **new)); + static char **inboth _RE_ARGS((char **left, char **right)); + + static ptr_t + xcalloc(n, s) + size_t n; + size_t s; + { + ptr_t r = calloc(n, s); + + if (!r) + dfaerror("Memory exhausted"); + return r; + } + + static ptr_t + xmalloc(n) + size_t n; + { + ptr_t r = malloc(n); + + assert(n != 0); + if (!r) + dfaerror("Memory exhausted"); + return r; + } + + static ptr_t + xrealloc(p, n) + ptr_t p; + size_t n; + { + ptr_t r = realloc(p, n); + + assert(n != 0); + if (!r) + dfaerror("Memory exhausted"); + return r; + } + + #define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t))) + #define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t))) + #define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t))) + + /* Reallocate an array of type t if nalloc is too small for index. 
*/ + #define REALLOC_IF_NECESSARY(p, t, nalloc, index) \ + if ((index) >= (nalloc)) \ + { \ + while ((index) >= (nalloc)) \ + (nalloc) *= 2; \ + REALLOC(p, t, nalloc); \ + } + + #ifdef DEBUG + + static void + prtok(t) + token t; + { + char *s; + + if (t < 0) + fprintf(stderr, "END"); + else if (t < NOTCHAR) + fprintf(stderr, "%c", t); + else + { + switch (t) + { + case EMPTY: s = "EMPTY"; break; + case BACKREF: s = "BACKREF"; break; + case BEGLINE: s = "BEGLINE"; break; + case ENDLINE: s = "ENDLINE"; break; + case BEGWORD: s = "BEGWORD"; break; + case ENDWORD: s = "ENDWORD"; break; + case LIMWORD: s = "LIMWORD"; break; + case NOTLIMWORD: s = "NOTLIMWORD"; break; + case QMARK: s = "QMARK"; break; + case STAR: s = "STAR"; break; + case PLUS: s = "PLUS"; break; + case CAT: s = "CAT"; break; + case OR: s = "OR"; break; + case ORTOP: s = "ORTOP"; break; + case LPAREN: s = "LPAREN"; break; + case RPAREN: s = "RPAREN"; break; + default: s = "CSET"; break; + } + fprintf(stderr, "%s", s); + } + } + #endif /* DEBUG */ + + /* Stuff pertaining to charclasses. */ + + static int + tstbit(b, c) + int b; + charclass c; + { + return c[b / INTBITS] & 1 << b % INTBITS; + } + + static void + setbit(b, c) + int b; + charclass c; + { + c[b / INTBITS] |= 1 << b % INTBITS; + } + + static void + clrbit(b, c) + int b; + charclass c; + { + c[b / INTBITS] &= ~(1 << b % INTBITS); + } + + static void + copyset(src, dst) + charclass src; + charclass dst; + { + int i; + + for (i = 0; i < CHARCLASS_INTS; ++i) + dst[i] = src[i]; + } + + static void + zeroset(s) + charclass s; + { + int i; + + for (i = 0; i < CHARCLASS_INTS; ++i) + s[i] = 0; + } + + static void + notset(s) + charclass s; + { + int i; + + for (i = 0; i < CHARCLASS_INTS; ++i) + s[i] = ~s[i]; + } + + static int + equal(s1, s2) + charclass s1; + charclass s2; + { + int i; + + for (i = 0; i < CHARCLASS_INTS; ++i) + if (s1[i] != s2[i]) + return 0; + return 1; + } + + /* A pointer to the current dfa is kept here during parsing. 
*/ + static struct dfa *dfa; + + /* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */ + static int + charclass_index(s) + charclass s; + { + int i; + + for (i = 0; i < dfa->cindex; ++i) + if (equal(s, dfa->charclasses[i])) + return i; + REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex); + ++dfa->cindex; + copyset(s, dfa->charclasses[i]); + return i; + } + + /* Syntax bits controlling the behavior of the lexical analyzer. */ + static reg_syntax_t syntax_bits, syntax_bits_set; + + /* Flag for case-folding letters into sets. */ + static int case_fold; + + /* Entry point to set syntax options. */ + void + dfasyntax(bits, fold) + reg_syntax_t bits; + int fold; + { + syntax_bits_set = 1; + syntax_bits = bits; + case_fold = fold; + } + + /* Lexical analyzer. All the dross that deals with the obnoxious + GNU Regex syntax bits is located here. The poor, suffering + reader is referred to the GNU Regex documentation for the + meaning of the @#%!@#%^!@ syntax bits. */ + + static char *lexstart; /* Pointer to beginning of input string. */ + static char *lexptr; /* Pointer to next input character. */ + static lexleft; /* Number of characters remaining. */ + static token lasttok; /* Previous token returned; initially END. */ + static int laststart; /* True if we're separated from beginning or (, | + only by zero-width characters. */ + static int parens; /* Count of outstanding left parens. */ + static int minrep, maxrep; /* Repeat counts for {m,n}. */ + + /* Note that characters become unsigned here. */ + #define FETCH(c, eoferr) \ + { \ + if (! 
lexleft) \ + if (eoferr != 0) \ + dfaerror(eoferr); \ + else \ + return lasttok = END; \ + (c) = (unsigned char) *lexptr++; \ + --lexleft; \ + } + + #ifdef __STDC__ + #define FUNC(F, P) static int F(int c) { return P(c); } + #else + #define FUNC(F, P) static int F(c) int c; { return P(c); } + #endif + + FUNC(is_alpha, ISALPHA) + FUNC(is_upper, ISUPPER) + FUNC(is_lower, ISLOWER) + FUNC(is_digit, ISDIGIT) + FUNC(is_xdigit, ISXDIGIT) + FUNC(is_space, ISSPACE) + FUNC(is_punct, ISPUNCT) + FUNC(is_alnum, ISALNUM) + FUNC(is_print, ISPRINT) + FUNC(is_graph, ISGRAPH) + FUNC(is_cntrl, ISCNTRL) + + /* The following list maps the names of the Posix named character classes + to predicate functions that determine whether a given character is in + the class. The leading [ has already been eaten by the lexical analyzer. */ + static struct { + const char *name; + int (*pred) _RE_ARGS((int)); + } prednames[] = { + { ":alpha:]", is_alpha }, + { ":upper:]", is_upper }, + { ":lower:]", is_lower }, + { ":digit:]", is_digit }, + { ":xdigit:]", is_xdigit }, + { ":space:]", is_space }, + { ":punct:]", is_punct }, + { ":alnum:]", is_alnum }, + { ":print:]", is_print }, + { ":graph:]", is_graph }, + { ":cntrl:]", is_cntrl }, + { 0 } + }; + + static int + looking_at(s) + const char *s; + { + size_t len; + + len = strlen(s); + if (lexleft < len) + return 0; + return strncmp(s, lexptr, len) == 0; + } + + static token + lex() + { + token c, c1, c2; + int backslash = 0, invert; + charclass ccl; + int i; + + /* Basic plan: We fetch a character. If it's a backslash, + we set the backslash flag and go through the loop again. + On the plus side, this avoids having a duplicate of the + main switch inside the backslash case. On the minus side, + it means that just about every case begins with + "if (backslash) ...". 
*/ + for (i = 0; i < 2; ++i) + { + FETCH(c, 0); + switch (c) + { + case '\\': + if (backslash) + goto normal_char; + if (lexleft == 0) + dfaerror("Unfinished \\ escape"); + backslash = 1; + break; + + case '^': + if (backslash) + goto normal_char; + if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS + || lasttok == END + || lasttok == LPAREN + || lasttok == OR) + return lasttok = BEGLINE; + goto normal_char; + + case '$': + if (backslash) + goto normal_char; + if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS + || lexleft == 0 + || (syntax_bits & RE_NO_BK_PARENS + ? lexleft > 0 && *lexptr == ')' + : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')') + || (syntax_bits & RE_NO_BK_VBAR + ? lexleft > 0 && *lexptr == '|' + : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|') + || ((syntax_bits & RE_NEWLINE_ALT) + && lexleft > 0 && *lexptr == '\n')) + return lasttok = ENDLINE; + goto normal_char; + + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + if (backslash && !(syntax_bits & RE_NO_BK_REFS)) + { + laststart = 0; + return lasttok = BACKREF; + } + goto normal_char; + + case '<': + if (syntax_bits & RE_NO_GNU_OPS) + goto normal_char; + if (backslash) + return lasttok = BEGWORD; + goto normal_char; + + case '>': + if (syntax_bits & RE_NO_GNU_OPS) + goto normal_char; + if (backslash) + return lasttok = ENDWORD; + goto normal_char; + + case 'b': + if (syntax_bits & RE_NO_GNU_OPS) + goto normal_char; + if (backslash) + return lasttok = LIMWORD; + goto normal_char; + + case 'B': + if (syntax_bits & RE_NO_GNU_OPS) + goto normal_char; + if (backslash) + return lasttok = NOTLIMWORD; + goto normal_char; + + case '?': + if (syntax_bits & RE_LIMITED_OPS) + goto normal_char; + if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0)) + goto normal_char; + if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) + goto normal_char; + return lasttok = QMARK; + + case '*': + if (backslash) + goto normal_char; + if (!(syntax_bits & 
RE_CONTEXT_INDEP_OPS) && laststart) + goto normal_char; + return lasttok = STAR; + + case '+': + if (syntax_bits & RE_LIMITED_OPS) + goto normal_char; + if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0)) + goto normal_char; + if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) + goto normal_char; + return lasttok = PLUS; + + case '{': + if (!(syntax_bits & RE_INTERVALS)) + goto normal_char; + if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0)) + goto normal_char; + minrep = maxrep = 0; + /* Cases: + {M} - exact count + {M,} - minimum count, maximum is infinity + {,M} - 0 through M + {M,N} - M through N */ + FETCH(c, "unfinished repeat count"); + if (ISDIGIT(c)) + { + minrep = c - '0'; + for (;;) + { + FETCH(c, "unfinished repeat count"); + if (!ISDIGIT(c)) + break; + minrep = 10 * minrep + c - '0'; + } + } + else if (c != ',') + dfaerror("malformed repeat count"); + if (c == ',') + for (;;) + { + FETCH(c, "unfinished repeat count"); + if (!ISDIGIT(c)) + break; + maxrep = 10 * maxrep + c - '0'; + } + else + maxrep = minrep; + if (!(syntax_bits & RE_NO_BK_BRACES)) + { + if (c != '\\') + dfaerror("malformed repeat count"); + FETCH(c, "unfinished repeat count"); + } + if (c != '}') + dfaerror("malformed repeat count"); + laststart = 0; + return lasttok = REPMN; + + case '|': + if (syntax_bits & RE_LIMITED_OPS) + goto normal_char; + if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0)) + goto normal_char; + laststart = 1; + return lasttok = OR; + + case '\n': + if (syntax_bits & RE_LIMITED_OPS + || backslash + || !(syntax_bits & RE_NEWLINE_ALT)) + goto normal_char; + laststart = 1; + return lasttok = OR; + + case '(': + if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0)) + goto normal_char; + ++parens; + laststart = 1; + return lasttok = LPAREN; + + case ')': + if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0)) + goto normal_char; + if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD) + goto normal_char; + --parens; + laststart = 0; + 
return lasttok = RPAREN; + + case '.': + if (backslash) + goto normal_char; + zeroset(ccl); + notset(ccl); + if (!(syntax_bits & RE_DOT_NEWLINE)) + clrbit('\n', ccl); + if (syntax_bits & RE_DOT_NOT_NULL) + clrbit('\0', ccl); + laststart = 0; + return lasttok = CSET + charclass_index(ccl); + + case 'w': + case 'W': + if (!backslash || (syntax_bits & RE_NO_GNU_OPS)) + goto normal_char; + zeroset(ccl); + for (c2 = 0; c2 < NOTCHAR; ++c2) + if (ISALNUM(c2)) + setbit(c2, ccl); + if (c == 'W') + notset(ccl); + laststart = 0; + return lasttok = CSET + charclass_index(ccl); + + case '[': + if (backslash) + goto normal_char; + zeroset(ccl); + FETCH(c, "Unbalanced ["); + if (c == '^') + { + FETCH(c, "Unbalanced ["); + invert = 1; + } + else + invert = 0; + do + { + /* Nobody ever said this had to be fast. :-) + Note that if we're looking at some other [:...:] + construct, we just treat it as a bunch of ordinary + characters. We can do this because we assume + regex has checked for syntax errors before + dfa is ever called. */ + if (c == '[' && (syntax_bits & RE_CHAR_CLASSES)) + for (c1 = 0; prednames[c1].name; ++c1) + if (looking_at(prednames[c1].name)) + { + for (c2 = 0; c2 < NOTCHAR; ++c2) + if ((*prednames[c1].pred)(c2)) + setbit(c2, ccl); + lexptr += strlen(prednames[c1].name); + lexleft -= strlen(prednames[c1].name); + FETCH(c1, "Unbalanced ["); + goto skip; + } + if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) + FETCH(c, "Unbalanced ["); + FETCH(c1, "Unbalanced ["); + if (c1 == '-') + { + FETCH(c2, "Unbalanced ["); + if (c2 == ']') + { + /* In the case [x-], the - is an ordinary hyphen, + which is left in c1, the lookahead character. 
*/ + --lexptr; + ++lexleft; + c2 = c; + } + else + { + if (c2 == '\\' + && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) + FETCH(c2, "Unbalanced ["); + FETCH(c1, "Unbalanced ["); + } + } + else + c2 = c; + while (c <= c2) + { + setbit(c, ccl); + if (case_fold) + if (ISUPPER(c)) + setbit(tolower(c), ccl); + else if (ISLOWER(c)) + setbit(toupper(c), ccl); + ++c; + } + skip: + ; + } + while ((c = c1) != ']'); + if (invert) + { + notset(ccl); + if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE) + clrbit('\n', ccl); + } + laststart = 0; + return lasttok = CSET + charclass_index(ccl); + + default: + normal_char: + laststart = 0; + if (case_fold && ISALPHA(c)) + { + zeroset(ccl); + setbit(c, ccl); + if (isupper(c)) + setbit(tolower(c), ccl); + else + setbit(toupper(c), ccl); + return lasttok = CSET + charclass_index(ccl); + } + return c; + } + } + + /* The above loop should consume at most a backslash + and some other character. */ + abort(); + } + + /* Recursive descent parser for regular expressions. */ + + static token tok; /* Lookahead token. */ + static depth; /* Current depth of a hypothetical stack + holding deferred productions. This is + used to determine the depth that will be + required of the real stack later on in + dfaanalyze(). */ + + /* Add the given token to the parse tree, maintaining the depth count and + updating the maximum depth if necessary. */ + static void + addtok(t) + token t; + { + REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex); + dfa->tokens[dfa->tindex++] = t; + + switch (t) + { + case QMARK: + case STAR: + case PLUS: + break; + + case CAT: + case OR: + case ORTOP: + --depth; + break; + + default: + ++dfa->nleaves; + case EMPTY: + ++depth; + break; + } + if (depth > dfa->depth) + dfa->depth = depth; + } + + /* The grammar understood by the parser is as follows. 
+ + regexp: + regexp OR branch + branch + + branch: + branch closure + closure + + closure: + closure QMARK + closure STAR + closure PLUS + atom + + atom: + + CSET + BACKREF + BEGLINE + ENDLINE + BEGWORD + ENDWORD + LIMWORD + NOTLIMWORD + + + The parser builds a parse tree in postfix form in an array of tokens. */ + + static void + atom() + { + if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF + || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD + || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD) + { + addtok(tok); + tok = lex(); + } + else if (tok == LPAREN) + { + tok = lex(); + regexp(0); + if (tok != RPAREN) + dfaerror("Unbalanced ("); + tok = lex(); + } + else + addtok(EMPTY); + } + + /* Return the number of tokens in the given subexpression. */ + static int + nsubtoks(tindex) + int tindex; + { + int ntoks1; + + switch (dfa->tokens[tindex - 1]) + { + default: + return 1; + case QMARK: + case STAR: + case PLUS: + return 1 + nsubtoks(tindex - 1); + case CAT: + case OR: + case ORTOP: + ntoks1 = nsubtoks(tindex - 1); + return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1); + } + } + + /* Copy the given subexpression to the top of the tree. 
*/ + static void + copytoks(tindex, ntokens) + int tindex, ntokens; + { + int i; + + for (i = 0; i < ntokens; ++i) + addtok(dfa->tokens[tindex + i]); + } + + static void + closure() + { + int tindex, ntokens, i; + + atom(); + while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN) + if (tok == REPMN) + { + ntokens = nsubtoks(dfa->tindex); + tindex = dfa->tindex - ntokens; + if (maxrep == 0) + addtok(PLUS); + if (minrep == 0) + addtok(QMARK); + for (i = 1; i < minrep; ++i) + { + copytoks(tindex, ntokens); + addtok(CAT); + } + for (; i < maxrep; ++i) + { + copytoks(tindex, ntokens); + addtok(QMARK); + addtok(CAT); + } + tok = lex(); + } + else + { + addtok(tok); + tok = lex(); + } + } + + static void + branch() + { + closure(); + while (tok != RPAREN && tok != OR && tok >= 0) + { + closure(); + addtok(CAT); + } + } + + static void + regexp(toplevel) + int toplevel; + { + branch(); + while (tok == OR) + { + tok = lex(); + branch(); + if (toplevel) + addtok(ORTOP); + else + addtok(OR); + } + } + + /* Main entry point for the parser. S is a string to be parsed, len is the + length of the string, so s can include NUL characters. D is a pointer to + the struct dfa to parse into. */ + void + dfaparse(s, len, d) + char *s; + size_t len; + struct dfa *d; + + { + dfa = d; + lexstart = lexptr = s; + lexleft = len; + lasttok = END; + laststart = 1; + parens = 0; + + if (! syntax_bits_set) + dfaerror("No syntax specified"); + + tok = lex(); + depth = d->depth; + + regexp(1); + + if (tok != END) + dfaerror("Unbalanced )"); + + addtok(END - d->nregexps); + addtok(CAT); + + if (d->nregexps) + addtok(ORTOP); + + ++d->nregexps; + } + + /* Some primitives for operating on sets of positions. */ + + /* Copy one set to another; the destination must be large enough. 
*/ + static void + copy(src, dst) + position_set *src; + position_set *dst; + { + int i; + + for (i = 0; i < src->nelem; ++i) + dst->elems[i] = src->elems[i]; + dst->nelem = src->nelem; + } + + /* Insert a position in a set. Position sets are maintained in sorted + order according to index. If position already exists in the set with + the same index then their constraints are logically or'd together. + S->elems must point to an array large enough to hold the resulting set. */ + static void + insert(p, s) + position p; + position_set *s; + { + int i; + position t1, t2; + + for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i) + ; + if (i < s->nelem && p.index == s->elems[i].index) + s->elems[i].constraint |= p.constraint; + else + { + t1 = p; + ++s->nelem; + while (i < s->nelem) + { + t2 = s->elems[i]; + s->elems[i++] = t1; + t1 = t2; + } + } + } + + /* Merge two sets of positions into a third. The result is exactly as if + the positions of both sets were inserted into an initially empty set. */ + static void + merge(s1, s2, m) + position_set *s1; + position_set *s2; + position_set *m; + { + int i = 0, j = 0; + + m->nelem = 0; + while (i < s1->nelem && j < s2->nelem) + if (s1->elems[i].index > s2->elems[j].index) + m->elems[m->nelem++] = s1->elems[i++]; + else if (s1->elems[i].index < s2->elems[j].index) + m->elems[m->nelem++] = s2->elems[j++]; + else + { + m->elems[m->nelem] = s1->elems[i++]; + m->elems[m->nelem++].constraint |= s2->elems[j++].constraint; + } + while (i < s1->nelem) + m->elems[m->nelem++] = s1->elems[i++]; + while (j < s2->nelem) + m->elems[m->nelem++] = s2->elems[j++]; + } + + /* Delete a position from a set. 
*/ + static void + delete(p, s) + position p; + position_set *s; + { + int i; + + for (i = 0; i < s->nelem; ++i) + if (p.index == s->elems[i].index) + break; + if (i < s->nelem) + for (--s->nelem; i < s->nelem; ++i) + s->elems[i] = s->elems[i + 1]; + } + + /* Find the index of the state corresponding to the given position set with + the given preceding context, or create a new state if there is no such + state. Newline and letter tell whether we got here on a newline or + letter, respectively. */ + static int + state_index(d, s, newline, letter) + struct dfa *d; + position_set *s; + int newline; + int letter; + { + int hash = 0; + int constraint; + int i, j; + + newline = newline ? 1 : 0; + letter = letter ? 1 : 0; + + for (i = 0; i < s->nelem; ++i) + hash ^= s->elems[i].index + s->elems[i].constraint; + + /* Try to find a state that exactly matches the proposed one. */ + for (i = 0; i < d->sindex; ++i) + { + if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem + || newline != d->states[i].newline || letter != d->states[i].letter) + continue; + for (j = 0; j < s->nelem; ++j) + if (s->elems[j].constraint + != d->states[i].elems.elems[j].constraint + || s->elems[j].index != d->states[i].elems.elems[j].index) + break; + if (j == s->nelem) + return i; + } + + /* We'll have to create a new state. 
*/ + REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex); + d->states[i].hash = hash; + MALLOC(d->states[i].elems.elems, position, s->nelem); + copy(s, &d->states[i].elems); + d->states[i].newline = newline; + d->states[i].letter = letter; + d->states[i].backref = 0; + d->states[i].constraint = 0; + d->states[i].first_end = 0; + for (j = 0; j < s->nelem; ++j) + if (d->tokens[s->elems[j].index] < 0) + { + constraint = s->elems[j].constraint; + if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0) + || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1) + || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0) + || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1)) + d->states[i].constraint |= constraint; + if (! d->states[i].first_end) + d->states[i].first_end = d->tokens[s->elems[j].index]; + } + else if (d->tokens[s->elems[j].index] == BACKREF) + { + d->states[i].constraint = NO_CONSTRAINT; + d->states[i].backref = 1; + } + + ++d->sindex; + + return i; + } + + /* Find the epsilon closure of a set of positions. If any position of the set + contains a symbol that matches the empty string in some context, replace + that position with the elements of its follow labeled with an appropriate + constraint. Repeat exhaustively until no funny positions are left. + S->elems must be large enough to hold the result. 
*/ + static void epsclosure _RE_ARGS((position_set *s, struct dfa *d)); + + static void + epsclosure(s, d) + position_set *s; + struct dfa *d; + { + int i, j; + int *visited; + position p, old; + + MALLOC(visited, int, d->tindex); + for (i = 0; i < d->tindex; ++i) + visited[i] = 0; + + for (i = 0; i < s->nelem; ++i) + if (d->tokens[s->elems[i].index] >= NOTCHAR + && d->tokens[s->elems[i].index] != BACKREF + && d->tokens[s->elems[i].index] < CSET) + { + old = s->elems[i]; + p.constraint = old.constraint; + delete(s->elems[i], s); + if (visited[old.index]) + { + --i; + continue; + } + visited[old.index] = 1; + switch (d->tokens[old.index]) + { + case BEGLINE: + p.constraint &= BEGLINE_CONSTRAINT; + break; + case ENDLINE: + p.constraint &= ENDLINE_CONSTRAINT; + break; + case BEGWORD: + p.constraint &= BEGWORD_CONSTRAINT; + break; + case ENDWORD: + p.constraint &= ENDWORD_CONSTRAINT; + break; + case LIMWORD: + p.constraint &= LIMWORD_CONSTRAINT; + break; + case NOTLIMWORD: + p.constraint &= NOTLIMWORD_CONSTRAINT; + break; + default: + break; + } + for (j = 0; j < d->follows[old.index].nelem; ++j) + { + p.index = d->follows[old.index].elems[j].index; + insert(p, s); + } + /* Force rescan to start at the beginning. */ + i = -1; + } + + free(visited); + } + + /* Perform bottom-up analysis on the parse tree, computing various functions. + Note that at this point, we're pretending constructs like \< are real + characters rather than constraints on what can follow them. + + Nullable: A node is nullable if it is at the root of a regexp that can + match the empty string. + * EMPTY leaves are nullable. + * No other leaf is nullable. + * A QMARK or STAR node is nullable. + * A PLUS node is nullable if its argument is nullable. + * A CAT node is nullable if both its arguments are nullable. + * An OR node is nullable if either argument is nullable. 
+ + Firstpos: The firstpos of a node is the set of positions (nonempty leaves) + that could correspond to the first character of a string matching the + regexp rooted at the given node. + * EMPTY leaves have empty firstpos. + * The firstpos of a nonempty leaf is that leaf itself. + * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its + argument. + * The firstpos of a CAT node is the firstpos of the left argument, union + the firstpos of the right if the left argument is nullable. + * The firstpos of an OR node is the union of firstpos of each argument. + + Lastpos: The lastpos of a node is the set of positions that could + correspond to the last character of a string matching the regexp at + the given node. + * EMPTY leaves have empty lastpos. + * The lastpos of a nonempty leaf is that leaf itself. + * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its + argument. + * The lastpos of a CAT node is the lastpos of its right argument, union + the lastpos of the left if the right argument is nullable. + * The lastpos of an OR node is the union of the lastpos of each argument. + + Follow: The follow of a position is the set of positions that could + correspond to the character following a character matching the node in + a string matching the regexp. At this point we consider special symbols + that match the empty string in some context to be just normal characters. + Later, if we find that a special symbol is in a follow set, we will + replace it with the elements of its follow, labeled with an appropriate + constraint. + * Every node in the firstpos of the argument of a STAR or PLUS node is in + the follow of every node in the lastpos. + * Every node in the firstpos of the second argument of a CAT node is in + the follow of every node in the lastpos of the first argument. + + Because of the postfix representation of the parse tree, the depth-first + analysis is conveniently done by a linear scan with the aid of a stack. 
+ Sets are stored as arrays of the elements, obeying a stack-like allocation + scheme; the number of elements in each set deeper in the stack can be + used to determine the address of a particular set's array. */ + void + dfaanalyze(d, searchflag) + struct dfa *d; + int searchflag; + { + int *nullable; /* Nullable stack. */ + int *nfirstpos; /* Element count stack for firstpos sets. */ + position *firstpos; /* Array where firstpos elements are stored. */ + int *nlastpos; /* Element count stack for lastpos sets. */ + position *lastpos; /* Array where lastpos elements are stored. */ + int *nalloc; /* Sizes of arrays allocated to follow sets. */ + position_set tmp; /* Temporary set for merging sets. */ + position_set merged; /* Result of merging sets. */ + int wants_newline; /* True if some position wants newline info. */ + int *o_nullable; + int *o_nfirst, *o_nlast; + position *o_firstpos, *o_lastpos; + int i, j; + position *pos; + + #ifdef DEBUG + fprintf(stderr, "dfaanalyze:\n"); + for (i = 0; i < d->tindex; ++i) + { + fprintf(stderr, " %d:", i); + prtok(d->tokens[i]); + } + putc('\n', stderr); + #endif + + d->searchflag = searchflag; + + MALLOC(nullable, int, d->depth); + o_nullable = nullable; + MALLOC(nfirstpos, int, d->depth); + o_nfirst = nfirstpos; + MALLOC(firstpos, position, d->nleaves); + o_firstpos = firstpos, firstpos += d->nleaves; + MALLOC(nlastpos, int, d->depth); + o_nlast = nlastpos; + MALLOC(lastpos, position, d->nleaves); + o_lastpos = lastpos, lastpos += d->nleaves; + MALLOC(nalloc, int, d->tindex); + for (i = 0; i < d->tindex; ++i) + nalloc[i] = 0; + MALLOC(merged.elems, position, d->nleaves); + + CALLOC(d->follows, position_set, d->tindex); + + for (i = 0; i < d->tindex; ++i) + #ifdef DEBUG + { /* Nonsyntactic #ifdef goo... */ + #endif + switch (d->tokens[i]) + { + case EMPTY: + /* The empty set is nullable. */ + *nullable++ = 1; + + /* The firstpos and lastpos of the empty leaf are both empty. 
*/ + *nfirstpos++ = *nlastpos++ = 0; + break; + + case STAR: + case PLUS: + /* Every element in the firstpos of the argument is in the follow + of every element in the lastpos. */ + tmp.nelem = nfirstpos[-1]; + tmp.elems = firstpos; + pos = lastpos; + for (j = 0; j < nlastpos[-1]; ++j) + { + merge(&tmp, &d->follows[pos[j].index], &merged); + REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position, + nalloc[pos[j].index], merged.nelem - 1); + copy(&merged, &d->follows[pos[j].index]); + } + + case QMARK: + /* A QMARK or STAR node is automatically nullable. */ + if (d->tokens[i] != PLUS) + nullable[-1] = 1; + break; + + case CAT: + /* Every element in the firstpos of the second argument is in the + follow of every element in the lastpos of the first argument. */ + tmp.nelem = nfirstpos[-1]; + tmp.elems = firstpos; + pos = lastpos + nlastpos[-1]; + for (j = 0; j < nlastpos[-2]; ++j) + { + merge(&tmp, &d->follows[pos[j].index], &merged); + REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position, + nalloc[pos[j].index], merged.nelem - 1); + copy(&merged, &d->follows[pos[j].index]); + } + + /* The firstpos of a CAT node is the firstpos of the first argument, + union that of the second argument if the first is nullable. */ + if (nullable[-2]) + nfirstpos[-2] += nfirstpos[-1]; + else + firstpos += nfirstpos[-1]; + --nfirstpos; + + /* The lastpos of a CAT node is the lastpos of the second argument, + union that of the first argument if the second is nullable. */ + if (nullable[-1]) + nlastpos[-2] += nlastpos[-1]; + else + { + pos = lastpos + nlastpos[-2]; + for (j = nlastpos[-1] - 1; j >= 0; --j) + pos[j] = lastpos[j]; + lastpos += nlastpos[-2]; + nlastpos[-2] = nlastpos[-1]; + } + --nlastpos; + + /* A CAT node is nullable if both arguments are nullable. */ + nullable[-2] = nullable[-1] && nullable[-2]; + --nullable; + break; + + case OR: + case ORTOP: + /* The firstpos is the union of the firstpos of each argument. 
*/ + nfirstpos[-2] += nfirstpos[-1]; + --nfirstpos; + + /* The lastpos is the union of the lastpos of each argument. */ + nlastpos[-2] += nlastpos[-1]; + --nlastpos; + + /* An OR node is nullable if either argument is nullable. */ + nullable[-2] = nullable[-1] || nullable[-2]; + --nullable; + break; + + default: + /* Anything else is a nonempty position. (Note that special + constructs like \< are treated as nonempty strings here; + an "epsilon closure" effectively makes them nullable later. + Backreferences have to get a real position so we can detect + transitions on them later. But they are nullable. */ + *nullable++ = d->tokens[i] == BACKREF; + + /* This position is in its own firstpos and lastpos. */ + *nfirstpos++ = *nlastpos++ = 1; + --firstpos, --lastpos; + firstpos->index = lastpos->index = i; + firstpos->constraint = lastpos->constraint = NO_CONSTRAINT; + + /* Allocate the follow set for this position. */ + nalloc[i] = 1; + MALLOC(d->follows[i].elems, position, nalloc[i]); + break; + } + #ifdef DEBUG + /* ... balance the above nonsyntactic #ifdef goo... */ + fprintf(stderr, "node %d:", i); + prtok(d->tokens[i]); + putc('\n', stderr); + fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n"); + fprintf(stderr, " firstpos:"); + for (j = nfirstpos[-1] - 1; j >= 0; --j) + { + fprintf(stderr, " %d:", firstpos[j].index); + prtok(d->tokens[firstpos[j].index]); + } + fprintf(stderr, "\n lastpos:"); + for (j = nlastpos[-1] - 1; j >= 0; --j) + { + fprintf(stderr, " %d:", lastpos[j].index); + prtok(d->tokens[lastpos[j].index]); + } + putc('\n', stderr); + } + #endif + + /* For each follow set that is the follow set of a real position, replace + it with its epsilon closure. 
*/ + for (i = 0; i < d->tindex; ++i) + if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF + || d->tokens[i] >= CSET) + { + #ifdef DEBUG + fprintf(stderr, "follows(%d:", i); + prtok(d->tokens[i]); + fprintf(stderr, "):"); + for (j = d->follows[i].nelem - 1; j >= 0; --j) + { + fprintf(stderr, " %d:", d->follows[i].elems[j].index); + prtok(d->tokens[d->follows[i].elems[j].index]); + } + putc('\n', stderr); + #endif + copy(&d->follows[i], &merged); + epsclosure(&merged, d); + if (d->follows[i].nelem < merged.nelem) + REALLOC(d->follows[i].elems, position, merged.nelem); + copy(&merged, &d->follows[i]); + } + + /* Get the epsilon closure of the firstpos of the regexp. The result will + be the set of positions of state 0. */ + merged.nelem = 0; + for (i = 0; i < nfirstpos[-1]; ++i) + insert(firstpos[i], &merged); + epsclosure(&merged, d); + + /* Check if any of the positions of state 0 will want newline context. */ + wants_newline = 0; + for (i = 0; i < merged.nelem; ++i) + if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint)) + wants_newline = 1; + + /* Build the initial state. */ + d->salloc = 1; + d->sindex = 0; + MALLOC(d->states, dfa_state, d->salloc); + state_index(d, &merged, wants_newline, 0); + + free(o_nullable); + free(o_nfirst); + free(o_firstpos); + free(o_nlast); + free(o_lastpos); + free(nalloc); + free(merged.elems); + } + + /* Find, for each character, the transition out of state s of d, and store + it in the appropriate slot of trans. + + We divide the positions of s into groups (positions can appear in more + than one group). Each group is labeled with a set of characters that + every position in the group matches (taking into account, if necessary, + preceding context information of s). For each group, find the union + of the its elements' follows. This set is the set of positions of the + new state. 
For each character in the group's label, set the transition + on this character to be to a state corresponding to the set's positions, + and its associated backward context information, if necessary. + + If we are building a searching matcher, we include the positions of state + 0 in every state. + + The collection of groups is constructed by building an equivalence-class + partition of the positions of s. + + For each position, find the set of characters C that it matches. Eliminate + any characters from C that fail on grounds of backward context. + + Search through the groups, looking for a group whose label L has nonempty + intersection with C. If L - C is nonempty, create a new group labeled + L - C and having the same positions as the current group, and set L to + the intersection of L and C. Insert the position in this group, set + C = C - L, and resume scanning. + + If after comparing with every group there are characters remaining in C, + create a new group labeled with the characters of C and insert this + position in that group. */ + void + dfastate(s, d, trans) + int s; + struct dfa *d; + int trans[]; + { + position_set grps[NOTCHAR]; /* As many as will ever be needed. */ + charclass labels[NOTCHAR]; /* Labels corresponding to the groups. */ + int ngrps = 0; /* Number of groups actually used. */ + position pos; /* Current position being considered. */ + charclass matches; /* Set of matching characters. */ + int matchesf; /* True if matches is nonempty. */ + charclass intersect; /* Intersection with some label set. */ + int intersectf; /* True if intersect is nonempty. */ + charclass leftovers; /* Stuff in the label that didn't match. */ + int leftoversf; /* True if leftovers is nonempty. */ + static charclass letters; /* Set of characters considered letters. */ + static charclass newline; /* Set of characters that aren't newline. */ + position_set follows; /* Union of the follows of some group. */ + position_set tmp; /* Temporary space for merging sets. 
*/ + int state; /* New state. */ + int wants_newline; /* New state wants to know newline context. */ + int state_newline; /* New state on a newline transition. */ + int wants_letter; /* New state wants to know letter context. */ + int state_letter; /* New state on a letter transition. */ + static initialized; /* Flag for static initialization. */ + int i, j, k; + + /* Initialize the set of letters, if necessary. */ + if (! initialized) + { + initialized = 1; + for (i = 0; i < NOTCHAR; ++i) + if (ISALNUM(i)) + setbit(i, letters); + setbit('\n', newline); + } + + zeroset(matches); + + for (i = 0; i < d->states[s].elems.nelem; ++i) + { + pos = d->states[s].elems.elems[i]; + if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR) + setbit(d->tokens[pos.index], matches); + else if (d->tokens[pos.index] >= CSET) + copyset(d->charclasses[d->tokens[pos.index] - CSET], matches); + else + continue; + + /* Some characters may need to be eliminated from matches because + they fail in the current context. */ + if (pos.constraint != 0xFF) + { + if (! MATCHES_NEWLINE_CONTEXT(pos.constraint, + d->states[s].newline, 1)) + clrbit('\n', matches); + if (! MATCHES_NEWLINE_CONTEXT(pos.constraint, + d->states[s].newline, 0)) + for (j = 0; j < CHARCLASS_INTS; ++j) + matches[j] &= newline[j]; + if (! MATCHES_LETTER_CONTEXT(pos.constraint, + d->states[s].letter, 1)) + for (j = 0; j < CHARCLASS_INTS; ++j) + matches[j] &= ~letters[j]; + if (! MATCHES_LETTER_CONTEXT(pos.constraint, + d->states[s].letter, 0)) + for (j = 0; j < CHARCLASS_INTS; ++j) + matches[j] &= letters[j]; + + /* If there are no characters left, there's no point in going on. */ + for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j) + ; + if (j == CHARCLASS_INTS) + continue; + } + + for (j = 0; j < ngrps; ++j) + { + /* If matches contains a single character only, and the current + group's label doesn't contain that character, go on to the + next group. 
*/ + if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR + && !tstbit(d->tokens[pos.index], labels[j])) + continue; + + /* Check if this group's label has a nonempty intersection with + matches. */ + intersectf = 0; + for (k = 0; k < CHARCLASS_INTS; ++k) + (intersect[k] = matches[k] & labels[j][k]) ? intersectf = 1 : 0; + if (! intersectf) + continue; + + /* It does; now find the set differences both ways. */ + leftoversf = matchesf = 0; + for (k = 0; k < CHARCLASS_INTS; ++k) + { + /* Even an optimizing compiler can't know this for sure. */ + int match = matches[k], label = labels[j][k]; + + (leftovers[k] = ~match & label) ? leftoversf = 1 : 0; + (matches[k] = match & ~label) ? matchesf = 1 : 0; + } + + /* If there were leftovers, create a new group labeled with them. */ + if (leftoversf) + { + copyset(leftovers, labels[ngrps]); + copyset(intersect, labels[j]); + MALLOC(grps[ngrps].elems, position, d->nleaves); + copy(&grps[j], &grps[ngrps]); + ++ngrps; + } + + /* Put the position in the current group. Note that there is no + reason to call insert() here. */ + grps[j].elems[grps[j].nelem++] = pos; + + /* If every character matching the current position has been + accounted for, we're done. */ + if (! matchesf) + break; + } + + /* If we've passed the last group, and there are still characters + unaccounted for, then we'll have to create a new group. */ + if (j == ngrps) + { + copyset(matches, labels[ngrps]); + zeroset(matches); + MALLOC(grps[ngrps].elems, position, d->nleaves); + grps[ngrps].nelem = 1; + grps[ngrps].elems[0] = pos; + ++ngrps; + } + } + + MALLOC(follows.elems, position, d->nleaves); + MALLOC(tmp.elems, position, d->nleaves); + + /* If we are a searching matcher, the default transition is to a state + containing the positions of state 0, otherwise the default transition + is to fail miserably. 
*/ + if (d->searchflag) + { + wants_newline = 0; + wants_letter = 0; + for (i = 0; i < d->states[0].elems.nelem; ++i) + { + if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint)) + wants_newline = 1; + if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint)) + wants_letter = 1; + } + copy(&d->states[0].elems, &follows); + state = state_index(d, &follows, 0, 0); + if (wants_newline) + state_newline = state_index(d, &follows, 1, 0); + else + state_newline = state; + if (wants_letter) + state_letter = state_index(d, &follows, 0, 1); + else + state_letter = state; + for (i = 0; i < NOTCHAR; ++i) + if (i == '\n') + trans[i] = state_newline; + else if (ISALNUM(i)) + trans[i] = state_letter; + else + trans[i] = state; + } + else + for (i = 0; i < NOTCHAR; ++i) + trans[i] = -1; + + for (i = 0; i < ngrps; ++i) + { + follows.nelem = 0; + + /* Find the union of the follows of the positions of the group. + This is a hideously inefficient loop. Fix it someday. */ + for (j = 0; j < grps[i].nelem; ++j) + for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k) + insert(d->follows[grps[i].elems[j].index].elems[k], &follows); + + /* If we are building a searching matcher, throw in the positions + of state 0 as well. */ + if (d->searchflag) + for (j = 0; j < d->states[0].elems.nelem; ++j) + insert(d->states[0].elems.elems[j], &follows); + + /* Find out if the new state will want any context information. */ + wants_newline = 0; + if (tstbit('\n', labels[i])) + for (j = 0; j < follows.nelem; ++j) + if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint)) + wants_newline = 1; + + wants_letter = 0; + for (j = 0; j < CHARCLASS_INTS; ++j) + if (labels[i][j] & letters[j]) + break; + if (j < CHARCLASS_INTS) + for (j = 0; j < follows.nelem; ++j) + if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint)) + wants_letter = 1; + + /* Find the state(s) corresponding to the union of the follows. 
*/ + state = state_index(d, &follows, 0, 0); + if (wants_newline) + state_newline = state_index(d, &follows, 1, 0); + else + state_newline = state; + if (wants_letter) + state_letter = state_index(d, &follows, 0, 1); + else + state_letter = state; + + /* Set the transitions for each character in the current label. */ + for (j = 0; j < CHARCLASS_INTS; ++j) + for (k = 0; k < INTBITS; ++k) + if (labels[i][j] & 1 << k) + { + int c = j * INTBITS + k; + + if (c == '\n') + trans[c] = state_newline; + else if (ISALNUM(c)) + trans[c] = state_letter; + else if (c < NOTCHAR) + trans[c] = state; + } + } + + for (i = 0; i < ngrps; ++i) + free(grps[i].elems); + free(follows.elems); + free(tmp.elems); + } + + /* Some routines for manipulating a compiled dfa's transition tables. + Each state may or may not have a transition table; if it does, and it + is a non-accepting state, then d->trans[state] points to its table. + If it is an accepting state then d->fails[state] points to its table. + If it has no table at all, then d->trans[state] is NULL. + TODO: Improve this comment, get rid of the unnecessary redundancy. */ + + static void + build_state(s, d) + int s; + struct dfa *d; + { + int *trans; /* The new transition table. */ + int i; + + /* Set an upper limit on the number of transition tables that will ever + exist at once. 1024 is arbitrary. The idea is that the frequently + used transition tables will be quickly rebuilt, whereas the ones that + were only needed once or twice will be cleared away. */ + if (d->trcount >= 1024) + { + for (i = 0; i < d->tralloc; ++i) + if (d->trans[i]) + { + free((ptr_t) d->trans[i]); + d->trans[i] = NULL; + } + else if (d->fails[i]) + { + free((ptr_t) d->fails[i]); + d->fails[i] = NULL; + } + d->trcount = 0; + } + + ++d->trcount; + + /* Set up the success bits for this state. 
*/ + d->success[s] = 0; + if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0, + s, *d)) + d->success[s] |= 4; + if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1, + s, *d)) + d->success[s] |= 2; + if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0, + s, *d)) + d->success[s] |= 1; + + MALLOC(trans, int, NOTCHAR); + dfastate(s, d, trans); + + /* Now go through the new transition table, and make sure that the trans + and fail arrays are allocated large enough to hold a pointer for the + largest state mentioned in the table. */ + for (i = 0; i < NOTCHAR; ++i) + if (trans[i] >= d->tralloc) + { + int oldalloc = d->tralloc; + + while (trans[i] >= d->tralloc) + d->tralloc *= 2; + REALLOC(d->realtrans, int *, d->tralloc + 1); + d->trans = d->realtrans + 1; + REALLOC(d->fails, int *, d->tralloc); + REALLOC(d->success, int, d->tralloc); + REALLOC(d->newlines, int, d->tralloc); + while (oldalloc < d->tralloc) + { + d->trans[oldalloc] = NULL; + d->fails[oldalloc++] = NULL; + } + } + + /* Keep the newline transition in a special place so we can use it as + a sentinel. */ + d->newlines[s] = trans['\n']; + trans['\n'] = -1; + + if (ACCEPTING(s, *d)) + d->fails[s] = trans; + else + d->trans[s] = trans; + } + + static void + build_state_zero(d) + struct dfa *d; + { + d->tralloc = 1; + d->trcount = 0; + CALLOC(d->realtrans, int *, d->tralloc + 1); + d->trans = d->realtrans + 1; + CALLOC(d->fails, int *, d->tralloc); + MALLOC(d->success, int, d->tralloc); + MALLOC(d->newlines, int, d->tralloc); + build_state(0, d); + } + + /* Search through a buffer looking for a match to the given struct dfa. + Find the first occurrence of a string matching the regexp in the buffer, + and the shortest possible version thereof. Return a pointer to the first + character after the match, or NULL if none is found. Begin points to + the beginning of the buffer, and end points to the first character after + its end. 
We store a newline in *end to act as a sentinel, so end had + better point somewhere valid. Newline is a flag indicating whether to + allow newlines to be in the matching string. If count is non- + NULL it points to a place we're supposed to increment every time we + see a newline. Finally, if backref is non-NULL it points to a place + where we're supposed to store a 1 if backreferencing happened and the + match needs to be verified by a backtracking matcher. Otherwise + we store a 0 in *backref. */ + char * + dfaexec(d, begin, end, newline, count, backref) + struct dfa *d; + char *begin; + char *end; + int newline; + int *count; + int *backref; + { + register s, s1, tmp; /* Current state. */ + register unsigned char *p; /* Current input character. */ + register **trans, *t; /* Copy of d->trans so it can be optimized + into a register. */ + static sbit[NOTCHAR]; /* Table for anding with d->success. */ + static sbit_init; + + if (! sbit_init) + { + int i; + + sbit_init = 1; + for (i = 0; i < NOTCHAR; ++i) + if (i == '\n') + sbit[i] = 4; + else if (ISALNUM(i)) + sbit[i] = 2; + else + sbit[i] = 1; + } + + if (! d->tralloc) + build_state_zero(d); + + s = s1 = 0; + p = (unsigned char *) begin; + trans = d->trans; + *end = '\n'; + + for (;;) + { + /* The dreaded inner loop. */ + if ((t = trans[s]) != 0) + do + { + s1 = t[*p++]; + if (! (t = trans[s1])) + goto last_was_s; + s = t[*p++]; + } + while ((t = trans[s]) != 0); + goto last_was_s1; + last_was_s: + tmp = s, s = s1, s1 = tmp; + last_was_s1: + + if (s >= 0 && p <= (unsigned char *) end && d->fails[s]) + { + if (d->success[s] & sbit[*p]) + { + if (backref) + if (d->states[s].backref) + *backref = 1; + else + *backref = 0; + return (char *) p; + } + + s1 = s; + s = d->fails[s][*p++]; + continue; + } + + /* If the previous character was a newline, count it. */ + if (count && (char *) p <= end && p[-1] == '\n') + ++*count; + + /* Check if we've run off the end of the buffer. 
*/ + if ((char *) p > end) + return NULL; + + if (s >= 0) + { + build_state(s, d); + trans = d->trans; + continue; + } + + if (p[-1] == '\n' && newline) + { + s = d->newlines[s1]; + continue; + } + + s = 0; + } + } + + /* Initialize the components of a dfa that the other routines don't + initialize for themselves. */ + void + dfainit(d) + struct dfa *d; + { + d->calloc = 1; + MALLOC(d->charclasses, charclass, d->calloc); + d->cindex = 0; + + d->talloc = 1; + MALLOC(d->tokens, token, d->talloc); + d->tindex = d->depth = d->nleaves = d->nregexps = 0; + + d->searchflag = 0; + d->tralloc = 0; + + d->musts = 0; + } + + /* Parse and analyze a single string of the given length. */ + void + dfacomp(s, len, d, searchflag) + char *s; + size_t len; + struct dfa *d; + int searchflag; + { + if (case_fold) /* dummy folding in service of dfamust() */ + { + char *lcopy; + int i; + + lcopy = malloc(len); + if (!lcopy) + dfaerror("out of memory"); + + /* This is a kludge. */ + case_fold = 0; + for (i = 0; i < len; ++i) + if (ISUPPER(s[i])) + lcopy[i] = tolower(s[i]); + else + lcopy[i] = s[i]; + + dfainit(d); + dfaparse(lcopy, len, d); + free(lcopy); + dfamust(d); + d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0; + case_fold = 1; + dfaparse(s, len, d); + dfaanalyze(d, searchflag); + } + else + { + dfainit(d); + dfaparse(s, len, d); + dfamust(d); + dfaanalyze(d, searchflag); + } + } + + /* Free the storage held by the components of a dfa. 
*/
+ /* d is expected to have been through dfaanalyze, which allocates
+    d->states and d->follows.  The transition-table arrays (realtrans,
+    fails, success, newlines) are allocated by build_state_zero, so they
+    are released only when d->tralloc is nonzero. */
+ void
+ dfafree(d)
+      struct dfa *d;
+ {
+   int i;
+   struct dfamust *dm, *ndm;
+
+   free((ptr_t) d->charclasses);
+   free((ptr_t) d->tokens);
+   for (i = 0; i < d->sindex; ++i)
+     free((ptr_t) d->states[i].elems.elems);
+   free((ptr_t) d->states);
+   for (i = 0; i < d->tindex; ++i)
+     if (d->follows[i].elems)
+       free((ptr_t) d->follows[i].elems);
+   free((ptr_t) d->follows);
+   if (d->tralloc)
+     {
+       /* Each state's table hangs off either trans[] or fails[]. */
+       for (i = 0; i < d->tralloc; ++i)
+         if (d->trans[i])
+           free((ptr_t) d->trans[i]);
+         else if (d->fails[i])
+           free((ptr_t) d->fails[i]);
+       free((ptr_t) d->realtrans);
+       free((ptr_t) d->fails);
+       /* d->success was previously never released. */
+       free((ptr_t) d->success);
+       free((ptr_t) d->newlines);
+     }
+   for (dm = d->musts; dm; dm = ndm)
+     {
+       ndm = dm->next;
+       free(dm->must);
+       free((ptr_t) dm);
+     }
+ }
+
+ /* Having found the postfix representation of the regular expression,
+    try to find a long sequence of characters that must appear in any line
+    containing the r.e.
+    Finding a "longest" sequence is beyond the scope here;
+    we take an easy way out and hope for the best.
+    (Take "(ab|a)b"--please.)
+
+    We do a bottom-up calculation of sequences of characters that must appear
+    in matches of r.e.'s represented by trees rooted at the nodes of the postfix
+    representation:
+         sequences that must appear at the left of the match ("left")
+         sequences that must appear at the right of the match ("right")
+         lists of sequences that must appear somewhere in the match ("in")
+         sequences that must constitute the match ("is")
+
+    When we get to the root of the tree, we use one of the longest of its
+    calculated "in" sequences as our answer.  The sequence we find is returned in
+    d->must (where "d" is the single argument passed to "dfamust");
+    the length of the sequence is returned in d->mustn.
+
+    The sequences calculated for the various types of node (in pseudo ANSI c)
+    are shown below.  "p" is the operand of unary operators (and the left-hand
+    operand of binary operators); "q" is the right-hand operand of binary
+    operators.
+
+    "ZERO" means "a zero-length sequence" below.
+ + Type left right is in + ---- ---- ----- -- -- + char c # c # c # c # c + + CSET ZERO ZERO ZERO ZERO + + STAR ZERO ZERO ZERO ZERO + + QMARK ZERO ZERO ZERO ZERO + + PLUS p->left p->right ZERO p->in + + CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus + p->left : q->right : q->is!=ZERO) ? q->in plus + p->is##q->left p->right##q->is p->is##q->is : p->right##q->left + ZERO + + OR longest common longest common (do p->is and substrings common to + leading trailing q->is have same p->in and q->in + (sub)sequence (sub)sequence length and + of p->left of p->right content) ? + and q->left and q->right p->is : NULL + + If there's anything else we recognize in the tree, all four sequences get set + to zero-length sequences. If there's something we don't recognize in the tree, + we just return a zero-length sequence. + + Break ties in favor of infrequent letters (choosing 'zzz' in preference to + 'aaa')? + + And. . .is it here or someplace that we might ponder "optimizations" such as + egrep 'psi|epsilon' -> egrep 'psi' + egrep 'pepsi|epsilon' -> egrep 'epsi' + (Yes, we now find "epsi" as a "string + that must occur", but we might also + simplify the *entire* r.e. being sought) + grep '[c]' -> grep 'c' + grep '(ab|a)b' -> grep 'ab' + grep 'ab*' -> grep 'a' + grep 'a*b' -> grep 'b' + + There are several issues: + + Is optimization easy (enough)? + + Does optimization actually accomplish anything, + or is the automaton you get from "psi|epsilon" (for example) + the same as the one you get from "psi" (for example)? + + Are optimizable r.e.'s likely to be used in real-life situations + (something like 'ab*' is probably unlikely; something like is + 'psi|epsilon' is likelier)? */ + + static char * + icatalloc(old, new) + char *old; + char *new; + { + char *result; + size_t oldsize, newsize; + + newsize = (new == NULL) ? 
0 : strlen(new); + if (old == NULL) + oldsize = 0; + else if (newsize == 0) + return old; + else oldsize = strlen(old); + if (old == NULL) + result = (char *) malloc(newsize + 1); + else + result = (char *) realloc((void *) old, oldsize + newsize + 1); + if (result != NULL && new != NULL) + (void) strcpy(result + oldsize, new); + return result; + } + + static char * + icpyalloc(string) + char *string; + { + return icatalloc((char *) NULL, string); + } + + static char * + istrstr(lookin, lookfor) + char *lookin; + char *lookfor; + { + char *cp; + size_t len; + + len = strlen(lookfor); + for (cp = lookin; *cp != '\0'; ++cp) + if (strncmp(cp, lookfor, len) == 0) + return cp; + return NULL; + } + + static void + ifree(cp) + char *cp; + { + if (cp != NULL) + free(cp); + } + + static void + freelist(cpp) + char **cpp; + { + int i; + + if (cpp == NULL) + return; + for (i = 0; cpp[i] != NULL; ++i) + { + free(cpp[i]); + cpp[i] = NULL; + } + } + + static char ** + enlist(cpp, new, len) + char **cpp; + char *new; + size_t len; + { + int i, j; + + if (cpp == NULL) + return NULL; + if ((new = icpyalloc(new)) == NULL) + { + freelist(cpp); + return NULL; + } + new[len] = '\0'; + /* Is there already something in the list that's new (or longer)? */ + for (i = 0; cpp[i] != NULL; ++i) + if (istrstr(cpp[i], new) != NULL) + { + free(new); + return cpp; + } + /* Eliminate any obsoleted strings. */ + j = 0; + while (cpp[j] != NULL) + if (istrstr(new, cpp[j]) == NULL) + ++j; + else + { + free(cpp[j]); + if (--i == j) + break; + cpp[j] = cpp[i]; + cpp[i] = NULL; + } + /* Add the new string. */ + cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp); + if (cpp == NULL) + return NULL; + cpp[i] = new; + cpp[i + 1] = NULL; + return cpp; + } + + /* Given pointers to two strings, return a pointer to an allocated + list of their distinct common substrings. Return NULL if something + seems wild. 
*/ + static char ** + comsubs(left, right) + char *left; + char *right; + { + char **cpp; + char *lcp; + char *rcp; + size_t i, len; + + if (left == NULL || right == NULL) + return NULL; + cpp = (char **) malloc(sizeof *cpp); + if (cpp == NULL) + return NULL; + cpp[0] = NULL; + for (lcp = left; *lcp != '\0'; ++lcp) + { + len = 0; + rcp = index(right, *lcp); + while (rcp != NULL) + { + for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i) + ; + if (i > len) + len = i; + rcp = index(rcp + 1, *lcp); + } + if (len == 0) + continue; + if ((cpp = enlist(cpp, lcp, len)) == NULL) + break; + } + return cpp; + } + + static char ** + addlists(old, new) + char **old; + char **new; + { + int i; + + if (old == NULL || new == NULL) + return NULL; + for (i = 0; new[i] != NULL; ++i) + { + old = enlist(old, new[i], strlen(new[i])); + if (old == NULL) + break; + } + return old; + } + + /* Given two lists of substrings, return a new list giving substrings + common to both. */ + static char ** + inboth(left, right) + char **left; + char **right; + { + char **both; + char **temp; + int lnum, rnum; + + if (left == NULL || right == NULL) + return NULL; + both = (char **) malloc(sizeof *both); + if (both == NULL) + return NULL; + both[0] = NULL; + for (lnum = 0; left[lnum] != NULL; ++lnum) + { + for (rnum = 0; right[rnum] != NULL; ++rnum) + { + temp = comsubs(left[lnum], right[rnum]); + if (temp == NULL) + { + freelist(both); + return NULL; + } + both = addlists(both, temp); + freelist(temp); + if (both == NULL) + return NULL; + } + } + return both; + } + + typedef struct + { + char **in; + char *left; + char *right; + char *is; + } must; + + static void + resetmust(mp) + must *mp; + { + mp->left[0] = mp->right[0] = mp->is[0] = '\0'; + freelist(mp->in); + } + + static void + dfamust(dfa) + struct dfa *dfa; + { + must *musts; + must *mp; + char *result; + int ri; + int i; + int exact; + token t; + static must must0; + struct dfamust *dm; + static char empty_string[] = ""; + + result = 
empty_string; + exact = 0; + musts = (must *) malloc((dfa->tindex + 1) * sizeof *musts); + if (musts == NULL) + return; + mp = musts; + for (i = 0; i <= dfa->tindex; ++i) + mp[i] = must0; + for (i = 0; i <= dfa->tindex; ++i) + { + mp[i].in = (char **) malloc(sizeof *mp[i].in); + mp[i].left = malloc(2); + mp[i].right = malloc(2); + mp[i].is = malloc(2); + if (mp[i].in == NULL || mp[i].left == NULL || + mp[i].right == NULL || mp[i].is == NULL) + goto done; + mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0'; + mp[i].in[0] = NULL; + } + #ifdef DEBUG + fprintf(stderr, "dfamust:\n"); + for (i = 0; i < dfa->tindex; ++i) + { + fprintf(stderr, " %d:", i); + prtok(dfa->tokens[i]); + } + putc('\n', stderr); + #endif + for (ri = 0; ri < dfa->tindex; ++ri) + { + switch (t = dfa->tokens[ri]) + { + case LPAREN: + case RPAREN: + goto done; /* "cannot happen" */ + case EMPTY: + case BEGLINE: + case ENDLINE: + case BEGWORD: + case ENDWORD: + case LIMWORD: + case NOTLIMWORD: + case BACKREF: + resetmust(mp); + break; + case STAR: + case QMARK: + if (mp <= musts) + goto done; /* "cannot happen" */ + --mp; + resetmust(mp); + break; + case OR: + case ORTOP: + if (mp < &musts[2]) + goto done; /* "cannot happen" */ + { + char **new; + must *lmp; + must *rmp; + int j, ln, rn, n; + + rmp = --mp; + lmp = --mp; + /* Guaranteed to be. Unlikely, but. . . 
*/ + if (strcmp(lmp->is, rmp->is) != 0) + lmp->is[0] = '\0'; + /* Left side--easy */ + i = 0; + while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i]) + ++i; + lmp->left[i] = '\0'; + /* Right side */ + ln = strlen(lmp->right); + rn = strlen(rmp->right); + n = ln; + if (n > rn) + n = rn; + for (i = 0; i < n; ++i) + if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1]) + break; + for (j = 0; j < i; ++j) + lmp->right[j] = lmp->right[(ln - i) + j]; + lmp->right[j] = '\0'; + new = inboth(lmp->in, rmp->in); + if (new == NULL) + goto done; + freelist(lmp->in); + free((char *) lmp->in); + lmp->in = new; + } + break; + case PLUS: + if (mp <= musts) + goto done; /* "cannot happen" */ + --mp; + mp->is[0] = '\0'; + break; + case END: + if (mp != &musts[1]) + goto done; /* "cannot happen" */ + for (i = 0; musts[0].in[i] != NULL; ++i) + if (strlen(musts[0].in[i]) > strlen(result)) + result = musts[0].in[i]; + if (strcmp(result, musts[0].is) == 0) + exact = 1; + goto done; + case CAT: + if (mp < &musts[2]) + goto done; /* "cannot happen" */ + { + must *lmp; + must *rmp; + + rmp = --mp; + lmp = --mp; + /* In. Everything in left, plus everything in + right, plus catenation of + left's right and right's left. 
*/ + lmp->in = addlists(lmp->in, rmp->in); + if (lmp->in == NULL) + goto done; + if (lmp->right[0] != '\0' && + rmp->left[0] != '\0') + { + char *tp; + + tp = icpyalloc(lmp->right); + if (tp == NULL) + goto done; + tp = icatalloc(tp, rmp->left); + if (tp == NULL) + goto done; + lmp->in = enlist(lmp->in, tp, + strlen(tp)); + free(tp); + if (lmp->in == NULL) + goto done; + } + /* Left-hand */ + if (lmp->is[0] != '\0') + { + lmp->left = icatalloc(lmp->left, + rmp->left); + if (lmp->left == NULL) + goto done; + } + /* Right-hand */ + if (rmp->is[0] == '\0') + lmp->right[0] = '\0'; + lmp->right = icatalloc(lmp->right, rmp->right); + if (lmp->right == NULL) + goto done; + /* Guaranteed to be */ + if (lmp->is[0] != '\0' && rmp->is[0] != '\0') + { + lmp->is = icatalloc(lmp->is, rmp->is); + if (lmp->is == NULL) + goto done; + } + else + lmp->is[0] = '\0'; + } + break; + default: + if (t < END) + { + /* "cannot happen" */ + goto done; + } + else if (t == '\0') + { + /* not on *my* shift */ + goto done; + } + else if (t >= CSET) + { + /* easy enough */ + resetmust(mp); + } + else + { + /* plain character */ + resetmust(mp); + mp->is[0] = mp->left[0] = mp->right[0] = t; + mp->is[1] = mp->left[1] = mp->right[1] = '\0'; + mp->in = enlist(mp->in, mp->is, (size_t)1); + if (mp->in == NULL) + goto done; + } + break; + } + #ifdef DEBUG + fprintf(stderr, " node: %d:", ri); + prtok(dfa->tokens[ri]); + fprintf(stderr, "\n in:"); + for (i = 0; mp->in[i]; ++i) + fprintf(stderr, " \"%s\"", mp->in[i]); + fprintf(stderr, "\n is: \"%s\"\n", mp->is); + fprintf(stderr, " left: \"%s\"\n", mp->left); + fprintf(stderr, " right: \"%s\"\n", mp->right); + #endif + ++mp; + } + done: + if (strlen(result)) + { + dm = (struct dfamust *) malloc(sizeof (struct dfamust)); + dm->exact = exact; + dm->must = malloc(strlen(result) + 1); + strcpy(dm->must, result); + dm->next = dfa->musts; + dfa->musts = dm; + } + mp = musts; + for (i = 0; i <= dfa->tindex; ++i) + { + freelist(mp[i].in); + ifree((char *) 
mp[i].in); + ifree(mp[i].left); + ifree(mp[i].right); + ifree(mp[i].is); + } + free((char *) mp); + } diff -crN gawk-2.15.3/dfa.h gawk-2.15.4/dfa.h *** gawk-2.15.3/dfa.h Wed Dec 31 19:00:00 1969 --- gawk-2.15.4/dfa.h Tue Jan 4 16:18:17 1994 *************** *** 0 **** --- 1,360 ---- + /* dfa.h - declarations for GNU deterministic regexp compiler + Copyright (C) 1988 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + + /* Written June, 1988 by Mike Haertel */ + + /* FIXME: + 2. We should not export so much of the DFA internals. + In addition to clobbering modularity, we eat up valuable + name space. */ + + /* Number of bits in an unsigned char. */ + #define CHARBITS 8 + + /* First integer value that is greater than any character code. */ + #define NOTCHAR (1 << CHARBITS) + + /* INTBITS need not be exact, just a lower bound. */ + #define INTBITS (CHARBITS * sizeof (int)) + + /* Number of ints required to hold a bit for every character. */ + #define CHARCLASS_INTS ((NOTCHAR + INTBITS - 1) / INTBITS) + + /* Sets of unsigned characters are stored as bit vectors in arrays of ints. */ + typedef int charclass[CHARCLASS_INTS]; + + /* The regexp is parsed into an array of tokens in postfix form. Some tokens + are operators and others are terminal symbols. Most (but not all) of these + codes are returned by the lexical analyzer. 
*/ + + typedef enum + { + END = -1, /* END is a terminal symbol that matches the + end of input; any value of END or less in + the parse tree is such a symbol. Accepting + states of the DFA are those that would have + a transition on END. */ + + /* Ordinary character values are terminal symbols that match themselves. */ + + EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches + the empty string. */ + + BACKREF, /* BACKREF is generated by \<digit>; it + is not completely handled. If the scanner + detects a transition on backref, it returns + a kind of "semi-success" indicating that + the match will have to be verified with + a backtracking matcher. */ + + BEGLINE, /* BEGLINE is a terminal symbol that matches + the empty string if it is at the beginning + of a line. */ + + ENDLINE, /* ENDLINE is a terminal symbol that matches + the empty string if it is at the end of + a line. */ + + BEGWORD, /* BEGWORD is a terminal symbol that matches + the empty string if it is at the beginning + of a word. */ + + ENDWORD, /* ENDWORD is a terminal symbol that matches + the empty string if it is at the end of + a word. */ + + LIMWORD, /* LIMWORD is a terminal symbol that matches + the empty string if it is at the beginning + or the end of a word. */ + + NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that + matches the empty string if it is not at + the beginning or end of a word. */ + + QMARK, /* QMARK is an operator of one argument that + matches zero or one occurrences of its + argument. */ + + STAR, /* STAR is an operator of one argument that + matches the Kleene closure (zero or more + occurrences) of its argument. */ + + PLUS, /* PLUS is an operator of one argument that + matches the positive closure (one or more + occurrences) of its argument. */ + + REPMN, /* REPMN is a lexical token corresponding + to the {m,n} construct. REPMN never + appears in the compiled token vector. */ + + CAT, /* CAT is an operator of two arguments that + matches the concatenation of its + arguments. 
CAT is never returned by the + lexical analyzer. */ + + OR, /* OR is an operator of two arguments that + matches either of its arguments. */ + + ORTOP, /* OR at the toplevel in the parse tree. + This is used for a boyer-moore heuristic. */ + + LPAREN, /* LPAREN never appears in the parse tree, + it is only a lexeme. */ + + RPAREN, /* RPAREN never appears in the parse tree. */ + + CSET /* CSET (and any value greater) is a + terminal symbol that matches any of a + class of characters. */ + } token; + + /* Sets are stored in an array in the compiled dfa; the index of the + array corresponding to a given set token is given by SET_INDEX(t). */ + #define SET_INDEX(t) ((t) - CSET) + + /* Sometimes characters can only be matched depending on the surrounding + context. Such context decisions depend on what the previous character + was, and the value of the current (lookahead) character. Context + dependent constraints are encoded as 8 bit integers. Each bit that + is set indicates that the constraint succeeds in the corresponding + context. + + bit 7 - previous and current are newlines + bit 6 - previous was newline, current isn't + bit 5 - previous wasn't newline, current is + bit 4 - neither previous nor current is a newline + bit 3 - previous and current are word-constituents + bit 2 - previous was word-constituent, current isn't + bit 1 - previous wasn't word-constituent, current is + bit 0 - neither previous nor current is word-constituent + + Word-constituent characters are those that satisfy isalnum(). + + The macro SUCCEEDS_IN_CONTEXT determines whether a given constraint + succeeds in a particular context. Prevn is true if the previous character + was a newline, currn is true if the lookahead character is a newline. + Prevl and currl similarly depend upon whether the previous and current + characters are word-constituent letters. */ + #define MATCHES_NEWLINE_CONTEXT(constraint, prevn, currn) \ + ((constraint) & 1 << (((prevn) ? 2 : 0) + ((currn) ? 
1 : 0) + 4)) + #define MATCHES_LETTER_CONTEXT(constraint, prevl, currl) \ + ((constraint) & 1 << (((prevl) ? 2 : 0) + ((currl) ? 1 : 0))) + #define SUCCEEDS_IN_CONTEXT(constraint, prevn, currn, prevl, currl) \ + (MATCHES_NEWLINE_CONTEXT(constraint, prevn, currn) \ + && MATCHES_LETTER_CONTEXT(constraint, prevl, currl)) + + /* The following macros give information about what a constraint depends on. */ + #define PREV_NEWLINE_DEPENDENT(constraint) \ + (((constraint) & 0xc0) >> 2 != ((constraint) & 0x30)) + #define PREV_LETTER_DEPENDENT(constraint) \ + (((constraint) & 0x0c) >> 2 != ((constraint) & 0x03)) + + /* Tokens that match the empty string subject to some constraint actually + work by applying that constraint to determine what may follow them, + taking into account what has gone before. The following values are + the constraints corresponding to the special tokens previously defined. */ + #define NO_CONSTRAINT 0xff + #define BEGLINE_CONSTRAINT 0xcf + #define ENDLINE_CONSTRAINT 0xaf + #define BEGWORD_CONSTRAINT 0xf2 + #define ENDWORD_CONSTRAINT 0xf4 + #define LIMWORD_CONSTRAINT 0xf6 + #define NOTLIMWORD_CONSTRAINT 0xf9 + + /* States of the recognizer correspond to sets of positions in the parse + tree, together with the constraints under which they may be matched. + So a position is encoded as an index into the parse tree together with + a constraint. */ + typedef struct + { + unsigned index; /* Index into the parse array. */ + unsigned constraint; /* Constraint for matching this position. */ + } position; + + /* Sets of positions are stored as arrays. */ + typedef struct + { + position *elems; /* Elements of this position set. */ + int nelem; /* Number of elements in this set. */ + } position_set; + + /* A state of the dfa consists of a set of positions, some flags, + and the token value of the lowest-numbered position of the state that + contains an END token. */ + typedef struct + { + int hash; /* Hash of the positions of this state. 
*/ + position_set elems; /* Positions this state could match. */ + char newline; /* True if previous state matched newline. */ + char letter; /* True if previous state matched a letter. */ + char backref; /* True if this state matches a \. */ + unsigned char constraint; /* Constraint for this state to accept. */ + int first_end; /* Token value of the first END in elems. */ + } dfa_state; + + /* Element of a list of strings, at least one of which is known to + appear in any R.E. matching the DFA. */ + struct dfamust + { + int exact; + char *must; + struct dfamust *next; + }; + + /* A compiled regular expression. */ + struct dfa + { + /* Stuff built by the scanner. */ + charclass *charclasses; /* Array of character sets for CSET tokens. */ + int cindex; /* Index for adding new charclasses. */ + int calloc; /* Number of charclasses currently allocated. */ + + /* Stuff built by the parser. */ + token *tokens; /* Postfix parse array. */ + int tindex; /* Index for adding new tokens. */ + int talloc; /* Number of tokens currently allocated. */ + int depth; /* Depth required of an evaluation stack + used for depth-first traversal of the + parse tree. */ + int nleaves; /* Number of leaves on the parse tree. */ + int nregexps; /* Count of parallel regexps being built + with dfaparse(). */ + + /* Stuff owned by the state builder. */ + dfa_state *states; /* States of the dfa. */ + int sindex; /* Index for adding new states. */ + int salloc; /* Number of states currently allocated. */ + + /* Stuff built by the structure analyzer. */ + position_set *follows; /* Array of follow sets, indexed by position + index. The follow of a position is the set + of positions containing characters that + could conceivably follow a character + matching the given position in a string + matching the regexp. Allocated to the + maximum possible position index. */ + int searchflag; /* True if we are supposed to build a searching + as opposed to an exact matcher. 
A searching + matcher finds the first and shortest string + matching a regexp anywhere in the buffer, + whereas an exact matcher finds the longest + string matching, but anchored to the + beginning of the buffer. */ + + /* Stuff owned by the executor. */ + int tralloc; /* Number of transition tables that have + slots so far. */ + int trcount; /* Number of transition tables that have + actually been built. */ + int **trans; /* Transition tables for states that can + never accept. If the transitions for a + state have not yet been computed, or the + state could possibly accept, its entry in + this table is NULL. */ + int **realtrans; /* Trans always points to realtrans + 1; this + is so trans[-1] can contain NULL. */ + int **fails; /* Transition tables after failing to accept + on a state that potentially could do so. */ + int *success; /* Table of acceptance conditions used in + dfaexec and computed in build_state. */ + int *newlines; /* Transitions on newlines. The entry for a + newline in any transition table is always + -1 so we can count lines without wasting + too many cycles. The transition for a + newline is stored separately and handled + as a special case. Newline is also used + as a sentinel at the end of the buffer. */ + struct dfamust *musts; /* List of strings, at least one of which + is known to appear in any r.e. matching + the dfa. */ + }; + + /* Some macros for user access to dfa internals. */ + + /* ACCEPTING returns true if s could possibly be an accepting state of r. */ + #define ACCEPTING(s, r) ((r).states[s].constraint) + + /* ACCEPTS_IN_CONTEXT returns true if the given state accepts in the + specified context. */ + #define ACCEPTS_IN_CONTEXT(prevn, currn, prevl, currl, state, dfa) \ + SUCCEEDS_IN_CONTEXT((dfa).states[state].constraint, \ + prevn, currn, prevl, currl) + + /* FIRST_MATCHING_REGEXP returns the index number of the first of parallel + regexps that a given state could accept. Parallel regexps are numbered + starting at 1. 
*/ + #define FIRST_MATCHING_REGEXP(state, dfa) (-(dfa).states[state].first_end) + + /* Entry points. */ + + #ifdef __STDC__ + + /* dfasyntax() takes two arguments; the first sets the syntax bits described + earlier in this file, and the second sets the case-folding flag. */ + extern void dfasyntax(reg_syntax_t, int); + + /* Compile the given string of the given length into the given struct dfa. + Final argument is a flag specifying whether to build a searching or an + exact matcher. */ + extern void dfacomp(char *, size_t, struct dfa *, int); + + /* Execute the given struct dfa on the buffer of characters. The + first char * points to the beginning, and the second points to the + first character after the end of the buffer, which must be a writable + place so a sentinel end-of-buffer marker can be stored there. The + second-to-last argument is a flag telling whether to allow newlines to + be part of a string matching the regexp. The next-to-last argument, + if non-NULL, points to a place to increment every time we see a + newline. The final argument, if non-NULL, points to a flag that will + be set if further examination by a backtracking matcher is needed in + order to verify backreferencing; otherwise the flag will be cleared. + Returns NULL if no match is found, or a pointer to the first + character after the first & shortest matching string in the buffer. */ + extern char *dfaexec(struct dfa *, char *, char *, int, int *, int *); + + /* Free the storage held by the components of a struct dfa. */ + extern void dfafree(struct dfa *); + + /* Entry points for people who know what they're doing. */ + + /* Initialize the components of a struct dfa. */ + extern void dfainit(struct dfa *); + + /* Incrementally parse a string of given length into a struct dfa. */ + extern void dfaparse(char *, size_t, struct dfa *); + + /* Analyze a parsed regexp; second argument tells whether to build a searching + or an exact matcher. 
*/ + extern void dfaanalyze(struct dfa *, int); + + /* Compute, for each possible character, the transitions out of a given + state, storing them in an array of integers. */ + extern void dfastate(int, struct dfa *, int []); + + /* Error handling. */ + + /* dfaerror() is called by the regexp routines whenever an error occurs. It + takes a single argument, a NUL-terminated string describing the error. + The default dfaerror() prints the error message to stderr and exits. + The user can provide a different dfaerror() if so desired. */ + extern void dfaerror(const char *); + + #else /* ! __STDC__ */ + extern void dfasyntax(), dfacomp(), dfafree(), dfainit(), dfaparse(); + extern void dfaanalyze(), dfastate(), dfaerror(); + extern char *dfaexec(); + #endif /* ! __STDC__ */ diff -crN gawk-2.15.3/eval.c gawk-2.15.4/eval.c *** gawk-2.15.3/eval.c Tue Nov 2 06:33:29 1993 --- gawk-2.15.4/eval.c Sat Jan 15 22:34:40 1994 *************** *** 3,9 **** */ /* ! * Copyright (C) 1986, 1988, 1989, 1991, 1992 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. --- 3,9 ---- */ /* ! * Copyright (C) 1986, 1988, 1989, 1991, 1992, 1993 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. *************** *** 318,324 **** break; case Node_K_delete: ! do_delete(tree->lnode, tree->rnode); break; case Node_K_next: --- 318,327 ---- break; case Node_K_delete: ! if (tree->rnode != NULL) ! do_delete(tree->lnode, tree->rnode); ! else ! assoc_clear(tree->lnode); break; case Node_K_next: *************** *** 967,984 **** /* should we free arg->var_value ? */ arg->var_array = n->var_array; arg->type = Node_var_array; } ! unref(n->lnode); freenode(n); count--; } while (count-- > 0) { n = *sp++; /* if n is an (local) array, all the elements should be freed */ ! 
if (n->type == Node_var_array) { assoc_clear(n); - free(n->var_array); - } unref(n->lnode); freenode(n); } --- 970,989 ---- /* should we free arg->var_value ? */ arg->var_array = n->var_array; arg->type = Node_var_array; + arg->array_size = n->array_size; + arg->table_size = n->table_size; } ! /* n->lnode overlays the array size, don't unref it if array */ ! if (n->type != Node_var_array) ! unref(n->lnode); freenode(n); count--; } while (count-- > 0) { n = *sp++; /* if n is an (local) array, all the elements should be freed */ ! if (n->type == Node_var_array) assoc_clear(n); unref(n->lnode); freenode(n); } diff -crN gawk-2.15.3/field.c gawk-2.15.4/field.c *** gawk-2.15.3/field.c Tue Nov 2 06:34:11 1993 --- gawk-2.15.4/field.c Wed Dec 29 10:32:45 1993 *************** *** 3,9 **** */ /* ! * Copyright (C) 1986, 1988, 1989, 1991, 1992 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. --- 3,9 ---- */ /* ! * Copyright (C) 1986, 1988, 1989, 1991, 1992, 1993 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. *************** *** 25,41 **** #include "awk.h" static int (*parse_field) P((int, char **, int, NODE *, ! Regexp *, void (*)(), NODE *)); static void rebuild_record P((void)); static int re_parse_field P((int, char **, int, NODE *, ! Regexp *, void (*)(), NODE *)); static int def_parse_field P((int, char **, int, NODE *, ! Regexp *, void (*)(), NODE *)); static int sc_parse_field P((int, char **, int, NODE *, ! Regexp *, void (*)(), NODE *)); static int fw_parse_field P((int, char **, int, NODE *, ! 
Regexp *, void (*)(), NODE *)); static void set_element P((int, char *, int, NODE *)); static void grow_fields_arr P((int num)); static void set_field P((int num, char *str, int len, NODE *dummy)); --- 25,43 ---- #include "awk.h" + typedef void (* Setfunc) P((int, char*, int, NODE *)); + static int (*parse_field) P((int, char **, int, NODE *, ! Regexp *, Setfunc, NODE *)); static void rebuild_record P((void)); static int re_parse_field P((int, char **, int, NODE *, ! Regexp *, Setfunc, NODE *)); static int def_parse_field P((int, char **, int, NODE *, ! Regexp *, Setfunc, NODE *)); static int sc_parse_field P((int, char **, int, NODE *, ! Regexp *, Setfunc, NODE *)); static int fw_parse_field P((int, char **, int, NODE *, ! Regexp *, Setfunc, NODE *)); static void set_element P((int, char *, int, NODE *)); static void grow_fields_arr P((int num)); static void set_field P((int num, char *str, int len, NODE *dummy)); *************** *** 226,232 **** int len; NODE *fs; Regexp *rp; ! void (*set) (); /* routine to set the value of the parsed field */ NODE *n; { register char *scan = *buf; --- 228,234 ---- int len; NODE *fs; Regexp *rp; ! Setfunc set; /* routine to set the value of the parsed field */ NODE *n; { register char *scan = *buf; *************** *** 244,252 **** scan++; field = scan; while (scan < end ! && research(rp, scan, 0, (int)(end - scan), 1) != -1 && nf < up_to) { ! if (REEND(rp, scan) == RESTART(rp, scan)) { /* null match */ scan++; if (scan == end) { (*set)(++nf, field, (int)(scan - field), n); --- 246,254 ---- scan++; field = scan; while (scan < end ! && research(rp, scan, 0, (end - scan), 1) != -1 && nf < up_to) { ! if (REEND(rp, scan) == RESTART(rp, scan)) { /* null match */ scan++; if (scan == end) { (*set)(++nf, field, (int)(scan - field), n); *************** *** 282,288 **** int len; NODE *fs; Regexp *rp; ! 
void (*set) (); /* routine to set the value of the parsed field */ NODE *n; { register char *scan = *buf; --- 284,290 ---- int len; NODE *fs; Regexp *rp; ! Setfunc set; /* routine to set the value of the parsed field */ NODE *n; { register char *scan = *buf; *************** *** 336,342 **** int len; NODE *fs; Regexp *rp; ! void (*set) (); /* routine to set the value of the parsed field */ NODE *n; { register char *scan = *buf; --- 338,344 ---- int len; NODE *fs; Regexp *rp; ! Setfunc set; /* routine to set the value of the parsed field */ NODE *n; { register char *scan = *buf; *************** *** 389,395 **** int len; NODE *fs; Regexp *rp; ! void (*set) (); /* routine to set the value of the parsed field */ NODE *n; { register char *scan = *buf; --- 391,397 ---- int len; NODE *fs; Regexp *rp; ! Setfunc set; /* routine to set the value of the parsed field */ NODE *n; { register char *scan = *buf; *************** *** 514,520 **** NODE *fs; char *s; int (*parseit)P((int, char **, int, NODE *, ! Regexp *, void (*)(), NODE *)); Regexp *rp = NULL; t1 = tree_eval(tree->lnode); --- 516,522 ---- NODE *fs; char *s; int (*parseit)P((int, char **, int, NODE *, ! Regexp *, Setfunc, NODE *)); Regexp *rp = NULL; t1 = tree_eval(tree->lnode); diff -crN gawk-2.15.3/gawk.1 gawk-2.15.4/gawk.1 *** gawk-2.15.3/gawk.1 Thu Nov 4 06:21:01 1993 --- gawk-2.15.4/gawk.1 Sun Dec 26 13:32:06 1993 *************** *** 1,7 **** .ds PX \s-1POSIX\s+1 .ds UX \s-1UNIX\s+1 .ds AN \s-1ANSI\s+1 ! .TH GAWK 1 "Nov 4 1993" "Free Software Foundation" "Utility Commands" .SH NAME gawk \- pattern scanning and processing language .SH SYNOPSIS --- 1,7 ---- .ds PX \s-1POSIX\s+1 .ds UX \s-1UNIX\s+1 .ds AN \s-1ANSI\s+1 ! .TH GAWK 1 "Dec 24 1993" "Free Software Foundation" "Utility Commands" .SH NAME gawk \- pattern scanning and processing language .SH SYNOPSIS *************** *** 71,76 **** --- 71,81 ---- Each .B \-W option has a corresponding GNU style long option, as detailed below. 
+ Arguments to GNU style long options are either joined with the option + by an + .B = + sign, with no intervening spaces, or they may be provided in the + next command line argument. .PP .I Gawk accepts the following options. *************** *** 114,119 **** --- 119,144 ---- (or .BR \-\^\-file ) options may be used. + .TP + .PD 0 + .BI \-mf= NNN + .TP + .BI \-mr= NNN + Set various memory limits to the value + .IR NNN . + The + .B f + flag sets the maximum number of fields, and the + .B r + flag sets the maximum record size. These two flags and the + .B \-m + option are from the AT&T Bell Labs research version of \*(UX + .IR awk . + They are ignored by + .IR gawk , + since + .I gawk + has no pre-defined limits. .TP \w'\fB\-\^\-copyright\fR'u+1n .PD 0 .B "\-W compat" *************** *** 158,163 **** --- 183,190 ---- .B \-\^\-usage Print a relatively short summary of the available options on the error output. + Per the GNU Coding Standards, these options cause an immediate, + successful exit. .TP .PD 0 .B "\-W lint" *************** *** 248,253 **** --- 275,282 ---- on your system is up to date with respect to whatever the Free Software Foundation is distributing. + Per the GNU Coding Standards, these options cause an immediate, + successful exit. .TP .B \-\^\- Signal the end of options. This is useful to allow further arguments to the *************** *** 255,261 **** This is mainly for consistency with the argument parsing convention used by most other \*(PX programs. .PP ! Any other options are flagged as illegal, but are otherwise ignored. .SH AWK PROGRAM EXECUTION .PP An AWK program consists of a sequence of pattern-action statements --- 284,296 ---- This is mainly for consistency with the argument parsing convention used by most other \*(PX programs. .PP ! In compatibility mode, ! any other options are flagged as illegal, but are otherwise ignored. ! In normal operation, as long as program text has been supplied, unknown ! 
options are passed on to the AWK program in the ! .B ARGV ! array for processing. This is particularly useful for running AWK ! programs via the ``#!'' executable interpreter mechansim. .SH AWK PROGRAM EXECUTION .PP An AWK program consists of a sequence of pattern-action statements *************** *** 270,292 **** .I Gawk first reads the program source from the .IR program-file (s) ! if specified, or from the first non-option argument on the command line. The .B \-f ! option may be used multiple times on the command line. .I Gawk will read the program text as if all the .IR program-file s had been concatenated together. This is useful for building libraries of AWK functions, without having to include them in each new AWK ! program that uses them. To use a library function in a file from a ! program typed in on the command line, specify ! .B /dev/tty ! as one of the ! .IR program-file s, ! type your program, and end it with a ! .B ^D ! (control-d). .PP The environment variable .B AWKPATH --- 305,327 ---- .I Gawk first reads the program source from the .IR program-file (s) ! if specified, ! from arguments to ! .BR "\-W source=" , ! or from the first non-option argument on the command line. The .B \-f ! and ! .B "\-W source=" ! options may be used multiple times on the command line. .I Gawk will read the program text as if all the .IR program-file s + and command line source texts had been concatenated together. This is useful for building libraries of AWK functions, without having to include them in each new AWK ! program that uses them. It also provides the ability to mix library ! functions with command line programs. .PP The environment variable .B AWKPATH *************** *** 302,312 **** .I Gawk executes AWK programs in the following order. First, .I gawk compiles the program into an internal form. ! Next, all variable assignments specified via the ! .B \-v ! option are performed. 
Then, .I gawk executes the code in the .B BEGIN --- 337,349 ---- .I Gawk executes AWK programs in the following order. First, + all variable assignments specified via the + .B \-v + option are performed. + Next, .I gawk compiles the program into an internal form. ! Then, .I gawk executes the code in the .B BEGIN *************** *** 359,366 **** AWK variables are dynamic; they come into existence when they are first used. Their values are either floating-point numbers or strings, or both, ! depending upon how they are used. AWK also has one dimension ! arrays; multiply dimensioned arrays may be simulated. Several pre-defined variables are set as a program runs; these will be described as needed and summarized below. .SS Fields --- 396,403 ---- AWK variables are dynamic; they come into existence when they are first used. Their values are either floating-point numbers or strings, or both, ! depending upon how they are used. AWK also has one dimensional ! arrays; arrays with multiple dimensions may be simulated. Several pre-defined variables are set as a program runs; these will be described as needed and summarized below. .SS Fields *************** *** 435,440 **** --- 472,478 ---- .B $0 to be recomputed, with the fields being separated by the value of .BR OFS . + References to negative numbered fields cause a fatal error. .SS Built-in Variables .PP AWK's built-in variables are: *************** *** 482,488 **** during a read for .BR getline , or during a ! .BR close , then .B ERRNO will contain --- 520,526 ---- during a read for .BR getline , or during a ! .BR close() , then .B ERRNO will contain *************** *** 649,654 **** --- 687,695 ---- An element may be deleted from an array using the .B delete statement. + The + .B delete + statement may also be used to delete the entire contents of an array. .SS Variable Typing And Conversion .PP Variables and fields *************** *** 685,691 **** .PP the variable .B b ! has a value of \fB"12"\fR and not \fB"12.00"\fR. 
.PP .I Gawk performs comparisons as follows: --- 726,732 ---- .PP the variable .B b ! has a string value of \fB"12"\fR and not \fB"12.00"\fR. .PP .I Gawk performs comparisons as follows: *************** *** 814,820 **** .PP The .IB pattern1 ", " pattern2 ! form of an expression is called a range pattern. It matches all input records starting with a line that matches .IR pattern1 , and continuing until a record that matches --- 855,862 ---- .PP The .IB pattern1 ", " pattern2 ! form of an expression is called a ! .IR "range pattern" . It matches all input records starting with a line that matches .IR pattern1 , and continuing until a record that matches *************** *** 987,992 **** --- 1029,1035 ---- \fBbreak\fR \fBcontinue\fR \fBdelete \fIarray\^\fB[\^\fIindex\^\fB]\fR + \fBdelete \fIarray\^\fR \fBexit\fR [ \fIexpression\fR ] \fB{ \fIstatements \fB} .fi *************** *** 1051,1060 **** --- 1094,1113 ---- .TP .BI print " expr-list" Prints expressions. + Each expression is separated by the value of the + .B OFS + variable. The output record is terminated with the value of the + .B ORS + variable. .TP .BI print " expr-list" " >" file Prints expressions on .IR file . + Each expression is separated by the value of the + .B OFS + variable. The output record is terminated with the value of the + .B ORS + variable. .TP .BI printf " fmt, expr-list" Format and print. *************** *** 1083,1090 **** .IB command " | getline" pipes into .BR getline . ! .BR Getline ! will return 0 on end of file, and \-1 on an error. .SS The \fIprintf\fP\^ Statement .PP The AWK versions of the --- 1136,1144 ---- .IB command " | getline" pipes into .BR getline . ! The ! .BR getline ! command will return 0 on end of file, and \-1 on an error. .SS The \fIprintf\fP\^ Statement .PP The AWK versions of the *************** *** 1158,1163 **** --- 1212,1218 ---- The field should be padded to this width. If the number has a leading zero, then the field will be padded with zeros. 
Otherwise it is padded with blanks. + This applies even to the non-numeric output formats. .TP .BI . prec A number indicating the maximum width of strings or digits to the right *************** *** 1234,1240 **** system call. If there are any additional fields, they are the group IDs returned by .IR getgroups (2). ! (Multiple groups may not be supported on all systems.) .TP .B /dev/stdin The standard input. --- 1289,1295 ---- system call. If there are any additional fields, they are the group IDs returned by .IR getgroups (2). ! Multiple groups may not be supported on all systems. .TP .B /dev/stdin The standard input. *************** *** 1365,1370 **** --- 1420,1428 ---- is omitted, .B FS is used instead. + The array + .I a + is cleared first. .TP .BI sprintf( fmt , " expr-list" ) prints *************** *** 1482,1492 **** As in \*(AN C, all following hexadecimal digits are considered part of the escape sequence. (This feature should tell us something about language design by committee.) ! E.g., "\ex1B" is the \s-1ASCII\s+1 \s-1ESC\s+1 (escape) character. .TP .BI \e ddd The character represented by the 1-, 2-, or 3-digit sequence of octal ! digits. E.g. "\e033" is the \s-1ASCII\s+1 \s-1ESC\s+1 (escape) character. .TP .BI \e c The literal character --- 1540,1550 ---- As in \*(AN C, all following hexadecimal digits are considered part of the escape sequence. (This feature should tell us something about language design by committee.) ! E.g., \fB"\ex1B"\fR is the \s-1ASCII\s+1 \s-1ESC\s+1 (escape) character. .TP .BI \e ddd The character represented by the 1-, 2-, or 3-digit sequence of octal ! digits. E.g. \fB"\e033"\fR is the \s-1ASCII\s+1 \s-1ESC\s+1 (escape) character. .TP .BI \e c The literal character *************** *** 1567,1573 **** .ft R .fi .SH SEE ALSO ! .IR egrep (1) .PP .IR "The AWK Programming Language" , Alfred V. Aho, Brian W. Kernighan, Peter J. Weinberger, --- 1625,1639 ---- .ft R .fi .SH SEE ALSO ! .IR egrep (1), ! .IR getpid (2), ! .IR getppid (2), ! 
.IR getpgrp (2), ! .IR getuid (2), ! .IR geteuid (2), ! .IR getgid (2), ! .IR getegid (2), ! .IR getgroups (2) .PP .IR "The AWK Programming Language" , Alfred V. Aho, Brian W. Kernighan, Peter J. Weinberger, *************** *** 1615,1621 **** When processing arguments, .I gawk uses the special option ``\fB\-\^\-\fP'' to signal the end of ! arguments, and warns about, but otherwise ignores, undefined options. .PP The AWK book does not define the return value of .BR srand() . --- 1681,1691 ---- When processing arguments, .I gawk uses the special option ``\fB\-\^\-\fP'' to signal the end of ! arguments. ! In compatibility mode, it will warn about, but otherwise ignore, ! undefined options. ! In normal operation, such arguments are passed on to the AWK program for ! it to process. .PP The AWK book does not define the return value of .BR srand() . *************** *** 1711,1716 **** --- 1781,1791 ---- The use of .B "next file" to abandon processing of the current input file. + .TP + \(bu + The use of + .BI delete " array" + to delete the entire contents of an array. .RE .PP The AWK book does not define the return value of the *************** *** 1738,1744 **** will be set to the tab character. Since this is a rather ugly special case, it is not the default behavior. This behavior also does not occur if ! .B \-Wposix has been specified. .ig .PP --- 1813,1819 ---- will be set to the tab character. Since this is a rather ugly special case, it is not the default behavior. This behavior also does not occur if ! .B "\-W posix" has been specified. .ig .PP *************** *** 1790,1796 **** This feature is marked as ``deprecated'' in the \*(PX standard, and .I gawk will issue a warning about its use if ! .B \-Wlint is specified on the command line. .PP The other feature is the use of the --- 1865,1871 ---- This feature is marked as ``deprecated'' in the \*(PX standard, and .I gawk will issue a warning about its use if ! .B "\-W lint" is specified on the command line. 
.PP The other feature is the use of the *************** *** 1806,1812 **** statement. .I Gawk will support this usage if ! .B \-Wposix has not been specified. .SH BUGS The --- 1881,1887 ---- statement. .I Gawk will support this usage if ! .B "\-W posix" has not been specified. .SH BUGS The *************** *** 1849,1854 **** --- 1924,1930 ---- and .B \-e options of the 2.11 version are no longer recognized. + This fact will not even be documented in the manual page for version 2.16. .SH AUTHORS The original version of \*(UX .I awk *************** *** 1872,1877 **** --- 1948,1955 ---- The initial DOS port was done by Conrad Kwok and Scott Garfinkle. Scott Deifik is the current DOS maintainer. Pat Rankin did the port to VMS, and Michal Jaegermann did the port to the Atari ST. + The port to OS/2 was done by Kai Uwe Rommel, with contributions and + help from Darrel Hankerson. .SH ACKNOWLEDGEMENTS Brian Kernighan of Bell Labs provided valuable assistance during testing and debugging. diff -crN gawk-2.15.3/getopt.h gawk-2.15.4/getopt.h *** gawk-2.15.3/getopt.h Mon Oct 18 22:31:49 1993 --- gawk-2.15.4/getopt.h Wed Nov 24 09:49:50 1993 *************** *** 76,82 **** struct option { ! #if __STDC__ const char *name; #else char *name; --- 76,82 ---- struct option { ! #ifdef __STDC__ const char *name; #else char *name; *************** *** 94,100 **** #define required_argument 1 #define optional_argument 2 ! #if __STDC__ #if defined(__GNU_LIBRARY__) /* Many other libraries have conflicting prototypes for getopt, with differences in the consts, in stdlib.h. To avoid compilation --- 94,100 ---- #define required_argument 1 #define optional_argument 2 ! #ifdef __STDC__ #if defined(__GNU_LIBRARY__) /* Many other libraries have conflicting prototypes for getopt, with differences in the consts, in stdlib.h. 
To avoid compilation diff -crN gawk-2.15.3/io.c gawk-2.15.4/io.c *** gawk-2.15.3/io.c Sun Nov 7 11:59:31 1993 --- gawk-2.15.4/io.c Thu Jan 13 21:15:32 1994 *************** *** 3,9 **** */ /* ! * Copyright (C) 1986, 1988, 1989, 1991, 1992 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. --- 3,9 ---- */ /* ! * Copyright (C) 1986, 1988, 1989, 1991, 1992, 1993 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. *************** *** 56,69 **** static int wait_any P((int interesting)); #endif static IOBUF *gawk_popen P((char *cmd, struct redirect *rp)); ! static IOBUF *iop_open P((char *file, char *how)); static int gawk_pclose P((struct redirect *rp)); ! static int do_pathopen P((char *file)); ! static int str2mode P((char *mode)); static void spec_setup P((IOBUF *iop, int len, int allocate)); ! static int specfdopen P((IOBUF *iop, char *name, char *mode)); ! static int pidopen P((IOBUF *iop, char *name, char *mode)); ! static int useropen P((IOBUF *iop, char *name, char *mode)); extern FILE *fdopen(); --- 56,69 ---- static int wait_any P((int interesting)); #endif static IOBUF *gawk_popen P((char *cmd, struct redirect *rp)); ! static IOBUF *iop_open P((const char *file, const char *how)); static int gawk_pclose P((struct redirect *rp)); ! static int do_pathopen P((const char *file)); ! static int str2mode P((const char *mode)); static void spec_setup P((IOBUF *iop, int len, int allocate)); ! static int specfdopen P((IOBUF *iop, const char *name, const char *mode)); ! static int pidopen P((IOBUF *iop, const char *name, const char *mode)); ! 
static int useropen P((IOBUF *iop, const char *name, const char *mode)); extern FILE *fdopen(); *************** *** 262,267 **** --- 262,270 ---- if (inrec(iop) == 0) while (interpret(expression_value) && inrec(iop) == 0) ; + /* recover any space from C based alloca */ + (void) alloca(0); + if (exiting) break; } *************** *** 278,287 **** register char *str; int tflag = 0; int outflag = 0; ! char *direction = "to"; ! char *mode; int fd; ! char *what = NULL; switch (tree->type) { case Node_redirect_append: --- 281,290 ---- register char *str; int tflag = 0; int outflag = 0; ! const char *direction = "to"; ! const char *mode; int fd; ! const char *what = NULL; switch (tree->type) { case Node_redirect_append: *************** *** 394,402 **** rp->fp = stdout; else if (fd == fileno(stderr)) rp->fp = stderr; ! else ! rp->fp = fdopen(fd, mode); ! if (isatty(fd)) rp->flag |= RED_NOBUF; } } --- 397,409 ---- rp->fp = stdout; else if (fd == fileno(stderr)) rp->fp = stderr; ! else { ! rp->fp = fdopen(fd, (char *) mode); ! /* don't leak file descriptors */ ! if (rp->fp == NULL) ! close(fd); ! } ! if (rp->fp != NULL && isatty(fd)) rp->flag |= RED_NOBUF; } } *************** *** 589,595 **** static int str2mode(mode) ! char *mode; { int ret; --- 596,602 ---- static int str2mode(mode) ! const char *mode; { int ret; *************** *** 605,611 **** --- 612,620 ---- case 'a': ret = O_WRONLY|O_APPEND|O_CREAT; break; + default: + ret = 0; /* lint */ cant_happen(); } return ret; *************** *** 622,631 **** int devopen(name, mode) ! char *name, *mode; { int openfd = INVALID_HANDLE; ! char *cp, *ptr; int flag = 0; struct stat buf; extern double strtod(); --- 631,640 ---- int devopen(name, mode) ! const char *name, *mode; { int openfd = INVALID_HANDLE; ! const char *cp, *ptr; int flag = 0; struct stat buf; extern double strtod(); *************** *** 642,648 **** if (STREQ(name, "-")) openfd = fileno(stdin); ! 
else if (STREQN(name, "/dev/", 5) && stat(name, &buf) == -1) { cp = name + 5; if (STREQ(cp, "stdin") && (flag & O_RDONLY) == O_RDONLY) --- 651,657 ---- if (STREQ(name, "-")) openfd = fileno(stdin); ! else if (STREQN(name, "/dev/", 5) && stat((char *) name, &buf) == -1) { cp = name + 5; if (STREQ(cp, "stdin") && (flag & O_RDONLY) == O_RDONLY) *************** *** 701,707 **** static int specfdopen(iop, name, mode) IOBUF *iop; ! char *name, *mode; { int fd; IOBUF *tp; --- 710,716 ---- static int specfdopen(iop, name, mode) IOBUF *iop; ! const char *name, *mode; { int fd; IOBUF *tp; *************** *** 748,754 **** static int pidopen(iop, name, mode) IOBUF *iop; ! char *name, *mode; { char tbuf[BUFSIZ]; int i; --- 757,763 ---- static int pidopen(iop, name, mode) IOBUF *iop; ! const char *name, *mode; { char tbuf[BUFSIZ]; int i; *************** *** 780,791 **** static int useropen(iop, name, mode) IOBUF *iop; ! char *name, *mode; { char tbuf[BUFSIZ], *cp; int i; #if defined(NGROUPS_MAX) && NGROUPS_MAX > 0 ! #if defined(atarist) gid_t groupset[NGROUPS_MAX]; #else int groupset[NGROUPS_MAX]; --- 789,800 ---- static int useropen(iop, name, mode) IOBUF *iop; ! const char *name, *mode; { char tbuf[BUFSIZ], *cp; int i; #if defined(NGROUPS_MAX) && NGROUPS_MAX > 0 ! #if defined(atarist) || defined(__svr4__) gid_t groupset[NGROUPS_MAX]; #else int groupset[NGROUPS_MAX]; *************** *** 821,836 **** static IOBUF * iop_open(name, mode) ! char *name, *mode; { int openfd = INVALID_HANDLE; int flag = 0; struct stat buf; IOBUF *iop; static struct internal { ! char *name; int compare; ! int (*fp)(); IOBUF iob; } table[] = { { "/dev/fd/", 8, specfdopen }, --- 830,845 ---- static IOBUF * iop_open(name, mode) ! const char *name, *mode; { int openfd = INVALID_HANDLE; int flag = 0; struct stat buf; IOBUF *iop; static struct internal { ! const char *name; int compare; ! 
int (*fp) P((IOBUF*,const char *,const char *)); IOBUF iob; } table[] = { { "/dev/fd/", 8, specfdopen }, *************** *** 851,862 **** if (STREQ(name, "-")) openfd = fileno(stdin); ! else if (STREQN(name, "/dev/", 5) && stat(name, &buf) == -1) { int i; for (i = 0; i < devcount; i++) { if (STREQN(name, table[i].name, table[i].compare)) { ! IOBUF *iop = & table[i].iob; if (iop->buf != NULL) { spec_setup(iop, 0, 0); --- 860,871 ---- if (STREQ(name, "-")) openfd = fileno(stdin); ! else if (STREQN(name, "/dev/", 5) && stat((char *) name, &buf) == -1) { int i; for (i = 0; i < devcount; i++) { if (STREQN(name, table[i].name, table[i].compare)) { ! iop = & table[i].iob; if (iop->buf != NULL) { spec_setup(iop, 0, 0); *************** *** 1005,1011 **** struct redirect *rp; { int rval, aval, fd = rp->iop->fd; ! FILE *kludge = fdopen(fd, "r"); /* pclose needs FILE* w/ right fileno */ rp->iop->fd = dup(fd); /* kludge to allow close() + pclose() */ rval = iop_close(rp->iop); --- 1014,1020 ---- struct redirect *rp; { int rval, aval, fd = rp->iop->fd; ! FILE *kludge = fdopen(fd, (char *) "r"); /* pclose needs FILE* w/ right fileno */ rp->iop->fd = dup(fd); /* kludge to allow close() + pclose() */ rval = iop_close(rp->iop); *************** *** 1013,1019 **** aval = pclose(kludge); return (rval < 0 ? rval : aval); } ! #else /* VMS */ static struct { --- 1022,1028 ---- aval = pclose(kludge); return (rval < 0 ? rval : aval); } ! #else /* VMS || OS2 || MSDOS */ static struct { *************** *** 1063,1069 **** free(pipes[cur].command); return rval; } ! #endif /* VMS */ #endif /* PIPES_SIMULATED */ --- 1072,1078 ---- free(pipes[cur].command); return rval; } ! #endif /* VMS || OS2 || MSDOS */ #endif /* PIPES_SIMULATED */ *************** *** 1088,1094 **** rp = redirect(tree->rnode, &redir_error); if (rp == NULL && redir_error) { /* failed redirect */ if (! do_unix) { ! 
char *s = strerror(redir_error); unref(ERRNO_node->var_value); ERRNO_node->var_value = --- 1097,1103 ---- rp = redirect(tree->rnode, &redir_error); if (rp == NULL && redir_error) { /* failed redirect */ if (! do_unix) { ! s = strerror(redir_error); unref(ERRNO_node->var_value); ERRNO_node->var_value = *************** *** 1103,1109 **** errcode = 0; cnt = get_a_record(&s, iop, *RS, & errcode); if (! do_unix && errcode != 0) { ! char *s = strerror(errcode); unref(ERRNO_node->var_value); ERRNO_node->var_value = make_string(s, strlen(s)); --- 1112,1118 ---- errcode = 0; cnt = get_a_record(&s, iop, *RS, & errcode); if (! do_unix && errcode != 0) { ! s = strerror(errcode); unref(ERRNO_node->var_value); ERRNO_node->var_value = make_string(s, strlen(s)); *************** *** 1149,1155 **** int pathopen (file) ! char *file; { int fd = do_pathopen(file); --- 1158,1164 ---- int pathopen (file) ! const char *file; { int fd = do_pathopen(file); *************** *** 1181,1192 **** static int do_pathopen (file) ! char *file; { ! static char *savepath = DEFPATH; /* defined in config.h */ static int first = 1; ! char *awkpath, *cp; ! char trypath[BUFSIZ]; int fd; if (STREQ(file, "-")) --- 1190,1201 ---- static int do_pathopen (file) ! const char *file; { ! static const char *savepath = DEFPATH; /* defined in config.h */ static int first = 1; ! const char *awkpath; ! char *cp, trypath[BUFSIZ]; int fd; if (STREQ(file, "-")) diff -crN gawk-2.15.3/iop.c gawk-2.15.4/iop.c *** gawk-2.15.3/iop.c Thu Apr 29 15:29:15 1993 --- gawk-2.15.4/iop.c Tue Jan 4 16:18:21 1994 *************** *** 3,9 **** */ /* ! * Copyright (C) 1986, 1988, 1989, 1991, 1992 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. --- 3,9 ---- */ /* ! * Copyright (C) 1986, 1988, 1989, 1991, 1992, 1993 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. 
*************** *** 62,68 **** else if (fstat(fd, &stb) < 0) return 8*512; /* conservative in case of DECnet access */ else ! return 24*512; #else /* --- 62,68 ---- else if (fstat(fd, &stb) < 0) return 8*512; /* conservative in case of DECnet access */ else ! return 32*512; #else /* *************** *** 146,162 **** register char *bp = iop->off; char *bufend; char *start = iop->off; /* beginning of record */ - int saw_newline; char rs; ! int eat_whitespace; if (iop->cnt == EOF) /* previous read hit EOF */ return EOF; if (grRS == 0) { /* special case: grRS == "" */ rs = '\n'; - eat_whitespace = 0; - saw_newline = 0; } else rs = (char) grRS; --- 146,159 ---- register char *bp = iop->off; char *bufend; char *start = iop->off; /* beginning of record */ char rs; ! int saw_newline = 0, eat_whitespace = 0; /* used iff grRS==0 */ if (iop->cnt == EOF) /* previous read hit EOF */ return EOF; if (grRS == 0) { /* special case: grRS == "" */ rs = '\n'; } else rs = (char) grRS; diff -crN gawk-2.15.3/main.c gawk-2.15.4/main.c *** gawk-2.15.3/main.c Sun Nov 7 12:00:47 1993 --- gawk-2.15.4/main.c Tue Jan 4 17:10:36 1994 *************** *** 3,9 **** */ /* ! * Copyright (C) 1986, 1988, 1989, 1991, 1992 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. --- 3,9 ---- */ /* ! * Copyright (C) 1986, 1988, 1989, 1991, 1992, 1993 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. *************** *** 137,143 **** extern int optind; extern int opterr; extern char *optarg; ! char *optlist = "+F:f:v:W:"; #ifdef __EMX__ _response(&argc, &argv); --- 137,144 ---- extern int optind; extern int opterr; extern char *optarg; ! const char *optlist = "+F:f:v:W:m:"; ! 
int stopped_early = 0; #ifdef __EMX__ _response(&argc, &argv); *************** *** 171,177 **** Nnull_string->flags = (PERM|STR|STRING|NUM|NUMBER); /* Set up the special variables */ - /* * Note that this must be done BEFORE arg parsing else -F * breaks horribly --- 172,177 ---- *************** *** 223,228 **** --- 223,241 ---- pre_assign(optarg); break; + case 'm': + /* + * Research awk extension. + * -mf=nnn set # fields, gawk ignores + * -mr=nnn set record length, ditto + */ + if (do_lint) + warning("-m[fr] option irrelevant"); + if ((optarg[0] != 'r' && optarg[0] != 'f') + || optarg[1] != '=') + warning("-m option usage: -m[fn]=nnn"); + break; + case 'W': /* gawk specific options */ gawk_option(optarg); break; *************** *** 255,260 **** --- 268,281 ---- break; #endif + case 0: + /* + * getopt_long found an option that sets a variable + * instead of returning a letter. Do nothing, just + * cycle around for the next one. + */ + break; + case '?': default: /* *************** *** 271,276 **** --- 292,298 ---- if (! do_posix && (optopt == 0 || strchr(optlist, optopt) == NULL)) { optind--; + stopped_early = 1; goto out; } else if (optopt) /* Use 1003.2 required message format */ *************** *** 298,304 **** output_is_tty = 1; /* No -f or --source options, use next arg */ if (numfiles == -1) { ! if (optind > argc - 1) /* no args left */ usage(1); srcfiles[++numfiles].stype = CMDLINE; srcfiles[numfiles].val = argv[optind]; --- 320,326 ---- output_is_tty = 1; /* No -f or --source options, use next arg */ if (numfiles == -1) { ! if (optind > argc - 1 || stopped_early) /* no args left or no program */ usage(1); srcfiles[++numfiles].stype = CMDLINE; srcfiles[numfiles].val = argv[optind]; *************** *** 338,353 **** usage(exitval) int exitval; { ! char *opt1 = " -f progfile [--]"; ! #if defined(MSDOS) || defined(OS2) ! char *opt2 = " [--] \"program\""; #else ! char *opt2 = " [--] 'program'"; #endif ! char *regops = " [POSIX or GNU style options]"; ! 
version(); ! fprintf(stderr, "Usage: %s%s%s file ...\n\t%s%s%s file ...\n", myname, regops, opt1, myname, regops, opt2); /* GNU long options info. Gack. */ --- 360,374 ---- usage(exitval) int exitval; { ! const char *opt1 = " -f progfile [--]"; ! #if defined(MSDOS) || defined(OS2) || defined(VMS) ! const char *opt2 = " [--] \"program\""; #else ! const char *opt2 = " [--] 'program'"; #endif ! const char *regops = " [POSIX or GNU style options]"; ! fprintf(stderr, "Usage:\t%s%s%s file ...\n\t%s%s%s file ...\n", myname, regops, opt1, myname, regops, opt2); /* GNU long options info. Gack. */ *************** *** 355,366 **** fputs("\t-f progfile\t\t--file=progfile\n", stderr); fputs("\t-F fs\t\t\t--field-separator=fs\n", stderr); fputs("\t-v var=val\t\t--assign=var=val\n", stderr); fputs("\t-W compat\t\t--compat\n", stderr); fputs("\t-W copyleft\t\t--copyleft\n", stderr); fputs("\t-W copyright\t\t--copyright\n", stderr); fputs("\t-W help\t\t\t--help\n", stderr); fputs("\t-W lint\t\t\t--lint\n", stderr); ! #if 0 fputs("\t-W nostalgia\t\t--nostalgia\n", stderr); #endif #ifdef DEBUG --- 376,388 ---- fputs("\t-f progfile\t\t--file=progfile\n", stderr); fputs("\t-F fs\t\t\t--field-separator=fs\n", stderr); fputs("\t-v var=val\t\t--assign=var=val\n", stderr); + fputs("\t-m[fr]=val\n", stderr); fputs("\t-W compat\t\t--compat\n", stderr); fputs("\t-W copyleft\t\t--copyleft\n", stderr); fputs("\t-W copyright\t\t--copyright\n", stderr); fputs("\t-W help\t\t\t--help\n", stderr); fputs("\t-W lint\t\t\t--lint\n", stderr); ! #ifdef NOSTALGIA fputs("\t-W nostalgia\t\t--nostalgia\n", stderr); #endif #ifdef DEBUG *************** *** 395,401 **** along with this program; if not, write to the Free Software\n\ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.\n"; - version(); fputs(blurb_part1, stderr); fputs(blurb_part2, stderr); fputs(blurb_part3, stderr); --- 417,422 ---- *************** *** 407,413 **** char *str; { register NODE **tmp; ! 
int len = strlen(str); tmp = get_lhs(FS_node, (Func_ptr *) 0); unref(*tmp); --- 428,435 ---- char *str; { register NODE **tmp; ! /* int len = strlen(str); *//* don't do that - we want to ! avoid mismatched types */ tmp = get_lhs(FS_node, (Func_ptr *) 0); unref(*tmp); *************** *** 424,430 **** if (do_unix && ! do_posix) str[0] = '\t'; } ! *tmp = make_str_node(str, len, SCAN); /* do process escapes */ set_FS(); } --- 446,452 ---- if (do_unix && ! do_posix) str[0] = '\t'; } ! *tmp = make_str_node(str, strlen(str), SCAN); /* do process escapes */ set_FS(); } *************** *** 456,464 **** */ struct varinit { NODE **spec; ! char *name; NODETYPE type; ! char *strval; AWKNUM numval; Func_ptr assign; }; --- 478,486 ---- */ struct varinit { NODE **spec; ! const char *name; NODETYPE type; ! const char *strval; AWKNUM numval; Func_ptr assign; }; *************** *** 489,497 **** register struct varinit *vp; for (vp = varinit; vp->name; vp++) { ! *(vp->spec) = install(vp->name, node(vp->strval == 0 ? make_number(vp->numval) ! : make_string(vp->strval, strlen(vp->strval)), vp->type, (NODE *) NULL)); if (vp->assign) (*(vp->assign))(); --- 511,520 ---- register struct varinit *vp; for (vp = varinit; vp->name; vp++) { ! *(vp->spec) = install((char *) vp->name, node(vp->strval == 0 ? make_number(vp->numval) ! : make_string((char *) vp->strval, ! strlen(vp->strval)), vp->type, (NODE *) NULL)); if (vp->assign) (*(vp->assign))(); *************** *** 727,732 **** --- 750,757 ---- version() { fprintf(stderr, "%s, patchlevel %d\n", version_string, PATCHLEVEL); + /* per GNU coding standards, exit successfully, do nothing else */ + exit(0); } /* this mess will improve in 2.16 */ diff -crN gawk-2.15.3/missing/strftime.c gawk-2.15.4/missing/strftime.c *** gawk-2.15.3/missing/strftime.c Thu Oct 21 22:51:40 1993 --- gawk-2.15.4/missing/strftime.c Tue Jan 4 16:18:24 1994 *************** *** 151,174 **** #endif /* POSIX_SEMANTICS */ /* various tables, useful in North America */ ! 
static char *days_a[] = { "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", }; ! static char *days_l[] = { "Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", }; ! static char *months_a[] = { "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", }; ! static char *months_l[] = { "January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December", }; ! static char *ampm[] = { "AM", "PM", }; if (s == NULL || format == NULL || timeptr == NULL || maxsize == 0) return 0; --- 151,174 ---- #endif /* POSIX_SEMANTICS */ /* various tables, useful in North America */ ! static const char *days_a[] = { "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", }; ! static const char *days_l[] = { "Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", }; ! static const char *months_a[] = { "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", }; ! static const char *months_l[] = { "January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December", }; ! static const char *ampm[] = { "AM", "PM", }; if (s == NULL || format == NULL || timeptr == NULL || maxsize == 0) return 0; diff -crN gawk-2.15.3/msg.c gawk-2.15.4/msg.c *** gawk-2.15.3/msg.c Mon Aug 10 14:34:20 1992 --- gawk-2.15.4/msg.c Tue Jan 4 16:18:22 1994 *************** *** 3,9 **** */ /* ! * Copyright (C) 1986, 1988, 1989, 1991, 1992 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. --- 3,9 ---- */ /* ! * Copyright (C) 1986, 1988, 1989, 1991, 1992, 1993 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. *************** *** 31,38 **** /* VARARGS2 */ void err(s, emsg, argp) ! char *s; ! char *emsg; va_list argp; { char *file; --- 31,38 ---- /* VARARGS2 */ void err(s, emsg, argp) ! 
const char *s; ! const char *emsg; va_list argp; { char *file; *************** *** 49,56 **** } if (FNR) { file = FILENAME_node->var_value->stptr; if (file) ! (void) fprintf(stderr, "(FILENAME=%s ", file); (void) fprintf(stderr, "FNR=%d) ", FNR); } (void) fprintf(stderr, s); --- 49,57 ---- } if (FNR) { file = FILENAME_node->var_value->stptr; + (void) putc('(', stderr); if (file) ! (void) fprintf(stderr, "FILENAME=%s ", file); (void) fprintf(stderr, "FNR=%d) ", FNR); } (void) fprintf(stderr, s); diff -crN gawk-2.15.3/node.c gawk-2.15.4/node.c *** gawk-2.15.3/node.c Thu Jun 3 16:27:24 1993 --- gawk-2.15.4/node.c Tue Jan 4 16:18:23 1994 *************** *** 3,9 **** */ /* ! * Copyright (C) 1986, 1988, 1989, 1991, 1992 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. --- 3,9 ---- */ /* ! * Copyright (C) 1986, 1988, 1989, 1991, 1992, 1993 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. *************** *** 102,108 **** * (more complicated) variations on this theme didn't seem to pay off, but * systematic testing might be in order at some point */ ! static char *values[] = { "0", "1", "2", --- 102,108 ---- * (more complicated) variations on this theme didn't seem to pay off, but * systematic testing might be in order at some point */ ! static const char *values[] = { "0", "1", "2", *************** *** 137,143 **** num = (long)s->numbr; if ((AWKNUM) num == s->numbr) { /* integral value */ if (num < NVAL && num >= 0) { ! sp = values[num]; s->stlen = 1; } else { (void) sprintf(sp, "%ld", num); --- 137,143 ---- num = (long)s->numbr; if ((AWKNUM) num == s->numbr) { /* integral value */ if (num < NVAL && num >= 0) { ! sp = (char *) values[num]; s->stlen = 1; } else { (void) sprintf(sp, "%ld", num); *************** *** 145,151 **** } s->stfmt = -1; } else { ! 
(void) sprintf(sp, CONVFMT, s->numbr); s->stlen = strlen(sp); s->stfmt = (char)CONVFMTidx; } --- 145,151 ---- } s->stfmt = -1; } else { ! NUMTOSTR(sp, CONVFMT, s->numbr); s->stlen = strlen(sp); s->stfmt = (char)CONVFMTidx; } diff -crN gawk-2.15.3/patchlevel.h gawk-2.15.4/patchlevel.h *** gawk-2.15.3/patchlevel.h Sun Nov 7 10:45:11 1993 --- gawk-2.15.4/patchlevel.h Wed Jan 12 06:56:26 1994 *************** *** 1 **** ! #define PATCHLEVEL 3 --- 1 ---- ! #define PATCHLEVEL 4 diff -crN gawk-2.15.3/pc/Makefile.emx gawk-2.15.4/pc/Makefile.emx *** gawk-2.15.3/pc/Makefile.emx Tue Oct 19 21:58:04 1993 --- gawk-2.15.4/pc/Makefile.emx Sun Dec 19 16:35:58 1993 *************** *** 26,32 **** .SUFFIXES: $O .c .y .c$O: ! $(CC) $(CFLAGS) -DGAWK -c $< all: gawk.exe --- 26,32 ---- .SUFFIXES: $O .c .y .c$O: ! $(CC) $(CFLAGS) -DGAWK -DHAVE_CONFIG_H -c $< all: gawk.exe diff -crN gawk-2.15.3/pc/Makefile.msc gawk-2.15.4/pc/Makefile.msc *** gawk-2.15.3/pc/Makefile.msc Sun Nov 7 10:42:05 1993 --- gawk-2.15.4/pc/Makefile.msc Sun Dec 19 16:36:49 1993 *************** *** 38,44 **** .SUFFIXES: $O .c .y .c$O: ! $(CC) $(CFLAGS) -DGAWK -c $< all: gawk.exe --- 38,44 ---- .SUFFIXES: $O .c .y .c$O: ! $(CC) $(CFLAGS) -DGAWK -DHAVE_CONFIG_H -c $< all: gawk.exe diff -crN gawk-2.15.3/pc/Makefile.os2 gawk-2.15.4/pc/Makefile.os2 *** gawk-2.15.3/pc/Makefile.os2 Tue Oct 19 21:56:26 1993 --- gawk-2.15.4/pc/Makefile.os2 Sun Dec 19 16:35:33 1993 *************** *** 98,104 **** .SUFFIXES: $O .c .y .c$O: ! $(CC) $(CFLAGS) -DGAWK -c $< all: gawk.exe --- 98,104 ---- .SUFFIXES: $O .c .y .c$O: ! $(CC) $(CFLAGS) -DGAWK -DHAVE_CONFIG_H -c $< all: gawk.exe diff -crN gawk-2.15.3/pc/config.h gawk-2.15.4/pc/config.h *** gawk-2.15.3/pc/config.h Sun Nov 7 10:37:33 1993 --- gawk-2.15.4/pc/config.h Wed Dec 29 10:44:38 1993 *************** *** 5,11 **** */ /* ! * Copyright (C) 1991, 1992 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. 
--- 5,11 ---- */ /* ! * Copyright (C) 1991-1993 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. diff -crN gawk-2.15.3/pc/makegawk.bat gawk-2.15.4/pc/makegawk.bat *** gawk-2.15.3/pc/makegawk.bat Fri Sep 17 10:37:48 1993 --- gawk-2.15.4/pc/makegawk.bat Wed Dec 29 11:17:12 1993 *************** *** 6,12 **** REM Modified by Scott Deifik, July, 1992, Sep 1993 REM Based on earlier makefile for dos REM ! REM Copyright (C) 1986, 1988, 1989, 1991 the Free Software Foundation, Inc. REM REM This file is part of GAWK, the GNU implementation of the REM AWK Progamming Language. --- 6,12 ---- REM Modified by Scott Deifik, July, 1992, Sep 1993 REM Based on earlier makefile for dos REM ! REM Copyright (C) 1986, 1988, 1989, 1991, 1993 the Free Software Foundation, Inc. REM REM This file is part of GAWK, the GNU implementation of the REM AWK Progamming Language. *************** *** 42,53 **** cl -Za -c -AL %CFLAGS% -DGAWK array.c cl -Za -c -AL %CFLAGS% -DGAWK awktab.c cl -Za -c -AL %CFLAGS% -DGAWK builtin.c ! cl -Za -c -AL %CFLAGS% -DGAWK dfa.c cl -Za -c -AL %CFLAGS% -DGAWK eval.c cl -Za -c -AL %CFLAGS% -DGAWK field.c cl -Za -c -AL %CFLAGS% -DGAWK getid.c ! cl -Za -c -AL %CFLAGS% -DGAWK getopt.c ! cl -Za -c -AL %CFLAGS% -DGAWK getopt1.c cl -Za -c -AL %CFLAGS% -DGAWK io.c cl -Za -c -AL %CFLAGS% -DGAWK iop.c cl -Za -c -AL %CFLAGS% -DGAWK main.c --- 42,53 ---- cl -Za -c -AL %CFLAGS% -DGAWK array.c cl -Za -c -AL %CFLAGS% -DGAWK awktab.c cl -Za -c -AL %CFLAGS% -DGAWK builtin.c ! cl -Za -c -AL %CFLAGS% -DGAWK -DHAVE_CONFIG_H dfa.c cl -Za -c -AL %CFLAGS% -DGAWK eval.c cl -Za -c -AL %CFLAGS% -DGAWK field.c cl -Za -c -AL %CFLAGS% -DGAWK getid.c ! cl -Za -c -AL %CFLAGS% -DGAWK -DHAVE_CONFIG_H getopt.c ! 
cl -Za -c -AL %CFLAGS% -DGAWK -DHAVE_CONFIG_H getopt1.c cl -Za -c -AL %CFLAGS% -DGAWK io.c cl -Za -c -AL %CFLAGS% -DGAWK iop.c cl -Za -c -AL %CFLAGS% -DGAWK main.c *************** *** 57,63 **** cl -Za -c -AL %CFLAGS% -DGAWK popen.c cl -Za -c -AL %CFLAGS% -DGAWK re.c REM You can ignore the warnings you will get ! cl -Za -c -AL %CFLAGS% -DGAWK regex.c cl -Za -c -AL %CFLAGS% -DGAWK version.c REM REM link debug flags: /CO /NOE /NOI /st:30000 --- 57,63 ---- cl -Za -c -AL %CFLAGS% -DGAWK popen.c cl -Za -c -AL %CFLAGS% -DGAWK re.c REM You can ignore the warnings you will get ! cl -Za -c -AL %CFLAGS% -DGAWK -DHAVE_CONFIG_H regex.c cl -Za -c -AL %CFLAGS% -DGAWK version.c REM REM link debug flags: /CO /NOE /NOI /st:30000 diff -crN gawk-2.15.3/protos.h gawk-2.15.4/protos.h *** gawk-2.15.3/protos.h Tue Apr 27 21:10:18 1993 --- gawk-2.15.4/protos.h Wed Dec 29 10:35:41 1993 *************** *** 3,9 **** */ /* ! * Copyright (C) 1991, 1992, the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. --- 3,9 ---- */ /* ! * Copyright (C) 1991, 1992, 1993 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. *************** *** 51,57 **** extern int strlen P((const char *)); extern long strtol P((const char *, char **, int)); #if !defined(_MSC_VER) && !defined(__GNU_LIBRARY__) ! extern int strftime P((char *, int, const char *, const struct tm *)); #endif extern time_t time P((time_t *)); extern aptr_t memset P((aptr_t, int, size_t)); --- 51,57 ---- extern int strlen P((const char *)); extern long strtol P((const char *, char **, int)); #if !defined(_MSC_VER) && !defined(__GNU_LIBRARY__) ! 
extern size_t strftime P((char *, size_t, const char *, const struct tm *)); #endif extern time_t time P((time_t *)); extern aptr_t memset P((aptr_t, int, size_t)); *************** *** 60,69 **** extern aptr_t memchr P((const aptr_t, int, size_t)); extern int memcmp P((const aptr_t, const aptr_t, size_t)); ! /* extern int fprintf P((FILE *, char *, ...)); */ ! extern int fprintf P(()); #if !defined(MSDOS) && !defined(__GNU_LIBRARY__) ! extern int fwrite P((const char *, int, int, FILE *)); extern int fputs P((const char *, FILE *)); extern int unlink P((const char *)); #endif --- 60,68 ---- extern aptr_t memchr P((const aptr_t, int, size_t)); extern int memcmp P((const aptr_t, const aptr_t, size_t)); ! extern int fprintf P((FILE *, const char *, ...)); #if !defined(MSDOS) && !defined(__GNU_LIBRARY__) ! extern size_t fwrite P((const void *, size_t, size_t, FILE *)); extern int fputs P((const char *, FILE *)); extern int unlink P((const char *)); #endif *************** *** 75,81 **** extern int isatty P((int)); extern void exit P((int)); extern int system P((const char *)); ! extern int sscanf P((/* char *, char *, ... */)); #ifndef toupper extern int toupper P((int)); #endif --- 74,80 ---- extern int isatty P((int)); extern void exit P((int)); extern int system P((const char *)); ! extern int sscanf P((const char *, const char *, ...)); #ifndef toupper extern int toupper P((int)); #endif *************** *** 91,98 **** extern off_t lseek P((int, off_t, int)); extern int fseek P((FILE *, long, int)); extern int close P((int)); ! extern int creat P(()); ! extern int open P(()); extern int pipe P((int *)); extern int dup P((int)); extern int dup2 P((int,int)); --- 90,97 ---- extern off_t lseek P((int, off_t, int)); extern int fseek P((FILE *, long, int)); extern int close P((int)); ! extern int creat P((const char *, mode_t)); ! 
extern int open P((const char *, int, ...)); extern int pipe P((int *)); extern int dup P((int)); extern int dup2 P((int,int)); diff -crN gawk-2.15.3/re.c gawk-2.15.4/re.c *** gawk-2.15.3/re.c Tue Nov 2 06:30:54 1993 --- gawk-2.15.4/re.c Sat Jan 15 22:32:28 1994 *************** *** 3,9 **** */ /* ! * Copyright (C) 1991, 1992 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. --- 3,9 ---- */ /* ! * Copyright (C) 1991, 1992, 1993 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. *************** *** 30,41 **** Regexp * make_regexp(s, len, ignorecase, dfa) char *s; ! int len; int ignorecase; int dfa; { Regexp *rp; ! char *err; char *src = s; char *temp; char *end = s + len; --- 30,41 ---- Regexp * make_regexp(s, len, ignorecase, dfa) char *s; ! size_t len; int ignorecase; int dfa; { Regexp *rp; ! const char *rerr; char *src = s; char *temp; char *end = s + len; *************** *** 90,96 **** *dest = '\0' ; /* Only necessary if we print dest ? */ emalloc(rp, Regexp *, sizeof(*rp), "make_regexp"); memset((char *) rp, 0, sizeof(*rp)); ! emalloc(rp->pat.buffer, char *, 16, "make_regexp"); rp->pat.allocated = 16; emalloc(rp->pat.fastmap, char *, 256, "make_regexp"); --- 90,96 ---- *dest = '\0' ; /* Only necessary if we print dest ? */ emalloc(rp, Regexp *, sizeof(*rp), "make_regexp"); memset((char *) rp, 0, sizeof(*rp)); ! emalloc(rp->pat.buffer, unsigned char *, 16, "make_regexp"); rp->pat.allocated = 16; emalloc(rp->pat.fastmap, char *, 256, "make_regexp"); *************** *** 99,111 **** else rp->pat.translate = NULL; len = dest - temp; ! if ((err = re_compile_pattern(temp, len, &(rp->pat))) != NULL) ! fatal("%s: /%s/", err, temp); if (dfa && !ignorecase) { ! regcompile(temp, len, &(rp->dfareg), 1); rp->dfa = 1; } else rp->dfa = 0; free(temp); return rp; } --- 99,112 ---- else rp->pat.translate = NULL; len = dest - temp; ! 
if ((rerr = re_compile_pattern(temp, len, &(rp->pat))) != NULL) ! fatal("%s: /%s/", rerr, temp); if (dfa && !ignorecase) { ! dfacomp(temp, len, &(rp->dfareg), 1); rp->dfa = 1; } else rp->dfa = 0; + free(temp); return rp; } *************** *** 115,138 **** Regexp *rp; register char *str; int start; ! register int len; int need_start; { char *ret = str; if (rp->dfa) { ! char save1; ! char save2; int count = 0; int try_backref; ! save1 = str[start+len]; ! str[start+len] = '\n'; ! save2 = str[start+len+1]; ! ret = regexecute(&(rp->dfareg), str+start, str+start+len+1, 1, &count, &try_backref); ! str[start+len] = save1; ! str[start+len+1] = save2; } if (ret) { if (need_start || rp->dfa == 0) --- 116,139 ---- Regexp *rp; register char *str; int start; ! register size_t len; int need_start; { char *ret = str; if (rp->dfa) { ! char save; int count = 0; int try_backref; ! /* ! * dfa likes to stick a '\n' right after the matched ! * text. So we just save and restore the character. ! */ ! save = str[start+len]; ! ret = dfaexec(&(rp->dfareg), str+start, str+start+len, 1, &count, &try_backref); ! str[start+len] = save; } if (ret) { if (need_start || rp->dfa == 0) *************** *** 151,162 **** free(rp->pat.buffer); free(rp->pat.fastmap); if (rp->dfa) ! reg_free(&(rp->dfareg)); free(rp); } void ! reg_error(s) const char *s; { fatal(s); --- 152,163 ---- free(rp->pat.buffer); free(rp->pat.fastmap); if (rp->dfa) ! dfafree(&(rp->dfareg)); free(rp); } void ! dfaerror(s) const char *s; { fatal(s); *************** *** 194,200 **** t->re_text = dupnode(t1); free_temp(t1); } ! t->re_reg = make_regexp(t->re_text->stptr, t->re_text->stlen, IGNORECASE, t->re_cnt); t->re_flags &= ~CASE; t->re_flags |= IGNORECASE; return t->re_reg; --- 195,202 ---- t->re_text = dupnode(t1); free_temp(t1); } ! t->re_reg = make_regexp(t->re_text->stptr, t->re_text->stlen, ! 
IGNORECASE, t->re_cnt); t->re_flags &= ~CASE; t->re_flags |= IGNORECASE; return t->re_reg; *************** *** 203,208 **** void resetup() { ! (void) re_set_syntax(RE_SYNTAX_AWK); ! regsyntax(RE_SYNTAX_AWK, 0); } --- 205,212 ---- void resetup() { ! reg_syntax_t syn = RE_SYNTAX_AWK; ! ! (void) re_set_syntax(syn); ! dfasyntax(syn, 0); } diff -crN gawk-2.15.3/regex.c gawk-2.15.4/regex.c *** gawk-2.15.3/regex.c Wed Dec 31 19:00:00 1969 --- gawk-2.15.4/regex.c Tue Jan 11 06:34:34 1994 *************** *** 0 **** --- 1,5070 ---- + /* Extended regular expression matching and search library, + version 0.12. + (Implements POSIX draft P10003.2/D11.2, except for + internationalization features.) + + Copyright (C) 1993 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + + /* AIX requires this to be the first thing in the file. */ + #if defined (_AIX) && !defined (REGEX_MALLOC) + #pragma alloca + #endif + + #define _GNU_SOURCE + + #ifdef HAVE_CONFIG_H + #include "config.h" + #endif + + #if defined(STDC_HEADERS) && !defined(emacs) + #include + #else + /* We need this for `regex.h', and perhaps for the Emacs include files. */ + #include + #endif + + /* The `emacs' switch turns on certain matching commands + that make sense only in Emacs. 
*/ + #ifdef emacs + + #include "lisp.h" + #include "buffer.h" + #include "syntax.h" + + /* Emacs uses `NULL' as a predicate. */ + #undef NULL + + #else /* not emacs */ + + /* We used to test for `BSTRING' here, but only GCC and Emacs define + `BSTRING', as far as I know, and neither of them use this code. */ + #if HAVE_STRING_H || STDC_HEADERS + #include + #ifndef bcmp + #define bcmp(s1, s2, n) memcmp ((s1), (s2), (n)) + #endif + #ifndef bcopy + #define bcopy(s, d, n) memcpy ((d), (s), (n)) + #endif + #ifndef bzero + #define bzero(s, n) memset ((s), 0, (n)) + #endif + #else + #include + #endif + + #ifdef STDC_HEADERS + #include + #else + char *malloc (); + char *realloc (); + #endif + + + /* Define the syntax stuff for \<, \>, etc. */ + + /* This must be nonzero for the wordchar and notwordchar pattern + commands in re_match_2. */ + #ifndef Sword + #define Sword 1 + #endif + + #ifdef SYNTAX_TABLE + + extern char *re_syntax_table; + + #else /* not SYNTAX_TABLE */ + + /* How many characters in the character set. */ + #define CHAR_SET_SIZE 256 + + static char re_syntax_table[CHAR_SET_SIZE]; + + static void + init_syntax_once () + { + register int c; + static int done = 0; + + if (done) + return; + + bzero (re_syntax_table, sizeof re_syntax_table); + + for (c = 'a'; c <= 'z'; c++) + re_syntax_table[c] = Sword; + + for (c = 'A'; c <= 'Z'; c++) + re_syntax_table[c] = Sword; + + for (c = '0'; c <= '9'; c++) + re_syntax_table[c] = Sword; + + re_syntax_table['_'] = Sword; + + done = 1; + } + + #endif /* not SYNTAX_TABLE */ + + #define SYNTAX(c) re_syntax_table[c] + + #endif /* not emacs */ + + /* Get the interface, including the syntax bits. */ + #include "regex.h" + + /* isalpha etc. are used for the character classes. */ + #include + + /* Jim Meyering writes: + + "... Some ctype macros are valid only for character codes that + isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when + using /bin/cc or gcc but without giving an ansi option). 
So, all + ctype uses should be through macros like ISPRINT... If + STDC_HEADERS is defined, then autoconf has verified that the ctype + macros don't need to be guarded with references to isascii. ... + Defining isascii to 1 should let any compiler worth its salt + eliminate the && through constant folding." */ + #if ! defined (isascii) || defined (STDC_HEADERS) + #undef isascii + #define isascii(c) 1 + #endif + + #ifdef isblank + #define ISBLANK(c) (isascii (c) && isblank (c)) + #else + #define ISBLANK(c) ((c) == ' ' || (c) == '\t') + #endif + #ifdef isgraph + #define ISGRAPH(c) (isascii (c) && isgraph (c)) + #else + #define ISGRAPH(c) (isascii (c) && isprint (c) && !isspace (c)) + #endif + + #define ISPRINT(c) (isascii (c) && isprint (c)) + #define ISDIGIT(c) (isascii (c) && isdigit (c)) + #define ISALNUM(c) (isascii (c) && isalnum (c)) + #define ISALPHA(c) (isascii (c) && isalpha (c)) + #define ISCNTRL(c) (isascii (c) && iscntrl (c)) + #define ISLOWER(c) (isascii (c) && islower (c)) + #define ISPUNCT(c) (isascii (c) && ispunct (c)) + #define ISSPACE(c) (isascii (c) && isspace (c)) + #define ISUPPER(c) (isascii (c) && isupper (c)) + #define ISXDIGIT(c) (isascii (c) && isxdigit (c)) + + #ifndef NULL + #define NULL 0 + #endif + + /* We remove any previous definition of `SIGN_EXTEND_CHAR', + since ours (we hope) works properly with all combinations of + machines, compilers, `char' and `unsigned char' argument types. + (Per Bothner suggested the basic approach.) */ + #undef SIGN_EXTEND_CHAR + #if __STDC__ + #define SIGN_EXTEND_CHAR(c) ((signed char) (c)) + #else /* not __STDC__ */ + /* As in Harbison and Steele. */ + #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128) + #endif + + /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we + use `alloca' instead of `malloc'. This is because using malloc in + re_search* or re_match* could cause memory leaks when C-g is used in + Emacs; also, malloc is slower and causes storage fragmentation. 
On + the other hand, malloc is more portable, and easier to debug. + + Because we sometimes use alloca, some routines have to be macros, + not functions -- `alloca'-allocated space disappears at the end of the + function it is called in. */ + + #ifdef REGEX_MALLOC + + #define REGEX_ALLOCATE malloc + #define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize) + + #else /* not REGEX_MALLOC */ + + /* Emacs already defines alloca, sometimes. */ + #ifndef alloca + + /* Make alloca work the best possible way. */ + #ifdef __GNUC__ + #define alloca __builtin_alloca + #else /* not __GNUC__ */ + #if HAVE_ALLOCA_H + #include + #else /* not __GNUC__ or HAVE_ALLOCA_H */ + #ifndef _AIX /* Already did AIX, up at the top. */ + char *alloca (); + #endif /* not _AIX */ + #endif /* not HAVE_ALLOCA_H */ + #endif /* not __GNUC__ */ + + #endif /* not alloca */ + + #define REGEX_ALLOCATE alloca + + /* Assumes a `char *destination' variable. */ + #define REGEX_REALLOCATE(source, osize, nsize) \ + (destination = (char *) alloca (nsize), \ + bcopy (source, destination, osize), \ + destination) + + #endif /* not REGEX_MALLOC */ + + + /* True if `size1' is non-NULL and PTR is pointing anywhere inside + `string1' or just past its end. This works if PTR is NULL, which is + a good thing. */ + #define FIRST_STRING_P(ptr) \ + (size1 && string1 <= (ptr) && (ptr) <= string1 + size1) + + /* (Re)Allocate N items of type T using malloc, or fail. */ + #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t))) + #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t))) + #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t))) + + #define BYTEWIDTH 8 /* In bits. */ + + #define STREQ(s1, s2) ((strcmp (s1, s2) == 0)) + + #define MAX(a, b) ((a) > (b) ? (a) : (b)) + #define MIN(a, b) ((a) < (b) ? (a) : (b)) + + typedef char boolean; + #define false 0 + #define true 1 + + /* These are the command codes that appear in compiled regular + expressions. 
Some opcodes are followed by argument bytes. A + command code can specify any interpretation whatsoever for its + arguments. Zero bytes may appear in the compiled regular expression. + + The value of `exactn' is needed in search.c (search_buffer) in Emacs. + So regex.h defines a symbol `RE_EXACTN_VALUE' to be 1; the value of + `exactn' we use here must also be 1. */ + + typedef enum + { + no_op = 0, + + /* Followed by one byte giving n, then by n literal bytes. */ + exactn = 1, + + /* Matches any (more or less) character. */ + anychar, + + /* Matches any one char belonging to specified set. First + following byte is number of bitmap bytes. Then come bytes + for a bitmap saying which chars are in. Bits in each byte + are ordered low-bit-first. A character is in the set if its + bit is 1. A character too large to have a bit in the map is + automatically not in the set. */ + charset, + + /* Same parameters as charset, but match any character that is + not one of those specified. */ + charset_not, + + /* Start remembering the text that is matched, for storing in a + register. Followed by one byte with the register number, in + the range 0 to one less than the pattern buffer's re_nsub + field. Then followed by one byte with the number of groups + inner to this one. (This last has to be part of the + start_memory only because we need it in the on_failure_jump + of re_match_2.) */ + start_memory, + + /* Stop remembering the text that is matched and store it in a + memory register. Followed by one byte with the register + number, in the range 0 to one less than `re_nsub' in the + pattern buffer, and one byte with the number of inner groups, + just like `start_memory'. (We need the number of inner + groups here because we don't have any easy way of finding the + corresponding start_memory when we're at a stop_memory.) */ + stop_memory, + + /* Match a duplicate of something remembered. Followed by one + byte containing the register number. 
*/ + duplicate, + + /* Fail unless at beginning of line. */ + begline, + + /* Fail unless at end of line. */ + endline, + + /* Succeeds if at beginning of buffer (if emacs) or at beginning + of string to be matched (if not). */ + begbuf, + + /* Analogously, for end of buffer/string. */ + endbuf, + + /* Followed by two byte relative address to which to jump. */ + jump, + + /* Same as jump, but marks the end of an alternative. */ + jump_past_alt, + + /* Followed by two-byte relative address of place to resume at + in case of failure. */ + on_failure_jump, + + /* Like on_failure_jump, but pushes a placeholder instead of the + current string position when executed. */ + on_failure_keep_string_jump, + + /* Throw away latest failure point and then jump to following + two-byte relative address. */ + pop_failure_jump, + + /* Change to pop_failure_jump if know won't have to backtrack to + match; otherwise change to jump. This is used to jump + back to the beginning of a repeat. If what follows this jump + clearly won't match what the repeat does, such that we can be + sure that there is no use backtracking out of repetitions + already matched, then we change it to a pop_failure_jump. + Followed by two-byte address. */ + maybe_pop_jump, + + /* Jump to following two-byte address, and push a dummy failure + point. This failure point will be thrown away if an attempt + is made to use it for a failure. A `+' construct makes this + before the first repeat. Also used as an intermediary kind + of jump when compiling an alternative. */ + dummy_failure_jump, + + /* Push a dummy failure point and continue. Used at the end of + alternatives. */ + push_dummy_failure, + + /* Followed by two-byte relative address and two-byte number n. + After matching N times, jump to the address upon failure. */ + succeed_n, + + /* Followed by two-byte relative address, and two-byte number n. + Jump to the address N times, then fail. 
*/ + jump_n, + + /* Set the following two-byte relative address to the + subsequent two-byte number. The address *includes* the two + bytes of number. */ + set_number_at, + + wordchar, /* Matches any word-constituent character. */ + notwordchar, /* Matches any char that is not a word-constituent. */ + + wordbeg, /* Succeeds if at word beginning. */ + wordend, /* Succeeds if at word end. */ + + wordbound, /* Succeeds if at a word boundary. */ + notwordbound /* Succeeds if not at a word boundary. */ + + #ifdef emacs + ,before_dot, /* Succeeds if before point. */ + at_dot, /* Succeeds if at point. */ + after_dot, /* Succeeds if after point. */ + + /* Matches any character whose syntax is specified. Followed by + a byte which contains a syntax code, e.g., Sword. */ + syntaxspec, + + /* Matches any character whose syntax is not that specified. */ + notsyntaxspec + #endif /* emacs */ + } re_opcode_t; + + /* Common operations on the compiled pattern. */ + + /* Store NUMBER in two contiguous bytes starting at DESTINATION. */ + + #define STORE_NUMBER(destination, number) \ + do { \ + (destination)[0] = (number) & 0377; \ + (destination)[1] = (number) >> 8; \ + } while (0) + + /* Same as STORE_NUMBER, except increment DESTINATION to + the byte after where the number is stored. Therefore, DESTINATION + must be an lvalue. */ + + #define STORE_NUMBER_AND_INCR(destination, number) \ + do { \ + STORE_NUMBER (destination, number); \ + (destination) += 2; \ + } while (0) + + /* Put into DESTINATION a number stored in two contiguous bytes starting + at SOURCE. 
*/ + + #define EXTRACT_NUMBER(destination, source) \ + do { \ + (destination) = *(source) & 0377; \ + (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \ + } while (0) + + #ifdef DEBUG + static void extract_number _RE_ARGS((int *dest, unsigned char *source)); + static void + extract_number (dest, source) + int *dest; + unsigned char *source; + { + int temp = SIGN_EXTEND_CHAR (*(source + 1)); + *dest = *source & 0377; + *dest += temp << 8; + } + + #ifndef EXTRACT_MACROS /* To debug the macros. */ + #undef EXTRACT_NUMBER + #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src) + #endif /* not EXTRACT_MACROS */ + + #endif /* DEBUG */ + + /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number. + SOURCE must be an lvalue. */ + + #define EXTRACT_NUMBER_AND_INCR(destination, source) \ + do { \ + EXTRACT_NUMBER (destination, source); \ + (source) += 2; \ + } while (0) + + #ifdef DEBUG + static void extract_number_and_incr _RE_ARGS((int *destination, + unsigned char **source)); + static void + extract_number_and_incr (destination, source) + int *destination; + unsigned char **source; + { + extract_number (destination, *source); + *source += 2; + } + + #ifndef EXTRACT_MACROS + #undef EXTRACT_NUMBER_AND_INCR + #define EXTRACT_NUMBER_AND_INCR(dest, src) \ + extract_number_and_incr (&dest, &src) + #endif /* not EXTRACT_MACROS */ + + #endif /* DEBUG */ + + /* If DEBUG is defined, Regex prints many voluminous messages about what + it is doing (if the variable `debug' is nonzero). If linked with the + main program in `iregex.c', you can enter patterns and strings + interactively. And if linked with the main program in `main.c' and + the other test files, you can run the already-written tests. */ + + #ifdef DEBUG + + /* We use standard I/O for debugging. */ + #include + + /* It is useful to test things that ``must'' be true when debugging. 
*/ + #include + + static int debug = 0; + + #define DEBUG_STATEMENT(e) e + #define DEBUG_PRINT1(x) if (debug) printf (x) + #define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2) + #define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3) + #define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4) + #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \ + if (debug) print_partial_compiled_pattern (s, e) + #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \ + if (debug) print_double_string (w, s1, sz1, s2, sz2) + + + extern void printchar (); + + /* Print the fastmap in human-readable form. */ + + void + print_fastmap (fastmap) + char *fastmap; + { + unsigned was_a_range = 0; + unsigned i = 0; + + while (i < (1 << BYTEWIDTH)) + { + if (fastmap[i++]) + { + was_a_range = 0; + printchar (i - 1); + while (i < (1 << BYTEWIDTH) && fastmap[i]) + { + was_a_range = 1; + i++; + } + if (was_a_range) + { + printf ("-"); + printchar (i - 1); + } + } + } + putchar ('\n'); + } + + + /* Print a compiled pattern string in human-readable form, starting at + the START pointer into it and ending just before the pointer END. */ + + void + print_partial_compiled_pattern (start, end) + unsigned char *start; + unsigned char *end; + { + int mcnt, mcnt2; + unsigned char *p = start; + unsigned char *pend = end; + + if (start == NULL) + { + printf ("(null)\n"); + return; + } + + /* Loop over pattern commands. 
*/ + while (p < pend) + { + printf ("%d:\t", p - start); + + switch ((re_opcode_t) *p++) + { + case no_op: + printf ("/no_op"); + break; + + case exactn: + mcnt = *p++; + printf ("/exactn/%d", mcnt); + do + { + putchar ('/'); + printchar (*p++); + } + while (--mcnt); + break; + + case start_memory: + mcnt = *p++; + printf ("/start_memory/%d/%d", mcnt, *p++); + break; + + case stop_memory: + mcnt = *p++; + printf ("/stop_memory/%d/%d", mcnt, *p++); + break; + + case duplicate: + printf ("/duplicate/%d", *p++); + break; + + case anychar: + printf ("/anychar"); + break; + + case charset: + case charset_not: + { + register int c, last = -100; + register int in_range = 0; + + printf ("/charset [%s", + (re_opcode_t) *(p - 1) == charset_not ? "^" : ""); + + assert (p + *p < pend); + + for (c = 0; c < 256; c++) + if (c / 8 < *p + && (p[1 + (c/8)] & (1 << (c % 8)))) + { + /* Are we starting a range? */ + if (last + 1 == c && ! in_range) + { + putchar ('-'); + in_range = 1; + } + /* Have we broken a range? */ + else if (last + 1 != c && in_range) + { + printchar (last); + in_range = 0; + } + + if (! 
in_range) + printchar (c); + + last = c; + } + + if (in_range) + printchar (last); + + putchar (']'); + + p += 1 + *p; + } + break; + + case begline: + printf ("/begline"); + break; + + case endline: + printf ("/endline"); + break; + + case on_failure_jump: + extract_number_and_incr (&mcnt, &p); + printf ("/on_failure_jump to %d", p + mcnt - start); + break; + + case on_failure_keep_string_jump: + extract_number_and_incr (&mcnt, &p); + printf ("/on_failure_keep_string_jump to %d", p + mcnt - start); + break; + + case dummy_failure_jump: + extract_number_and_incr (&mcnt, &p); + printf ("/dummy_failure_jump to %d", p + mcnt - start); + break; + + case push_dummy_failure: + printf ("/push_dummy_failure"); + break; + + case maybe_pop_jump: + extract_number_and_incr (&mcnt, &p); + printf ("/maybe_pop_jump to %d", p + mcnt - start); + break; + + case pop_failure_jump: + extract_number_and_incr (&mcnt, &p); + printf ("/pop_failure_jump to %d", p + mcnt - start); + break; + + case jump_past_alt: + extract_number_and_incr (&mcnt, &p); + printf ("/jump_past_alt to %d", p + mcnt - start); + break; + + case jump: + extract_number_and_incr (&mcnt, &p); + printf ("/jump to %d", p + mcnt - start); + break; + + case succeed_n: + extract_number_and_incr (&mcnt, &p); + extract_number_and_incr (&mcnt2, &p); + printf ("/succeed_n to %d, %d times", p + mcnt - start, mcnt2); + break; + + case jump_n: + extract_number_and_incr (&mcnt, &p); + extract_number_and_incr (&mcnt2, &p); + printf ("/jump_n to %d, %d times", p + mcnt - start, mcnt2); + break; + + case set_number_at: + extract_number_and_incr (&mcnt, &p); + extract_number_and_incr (&mcnt2, &p); + printf ("/set_number_at location %d to %d", p + mcnt - start, mcnt2); + break; + + case wordbound: + printf ("/wordbound"); + break; + + case notwordbound: + printf ("/notwordbound"); + break; + + case wordbeg: + printf ("/wordbeg"); + break; + + case wordend: + printf ("/wordend"); + + #ifdef emacs + case before_dot: + printf 
("/before_dot"); + break; + + case at_dot: + printf ("/at_dot"); + break; + + case after_dot: + printf ("/after_dot"); + break; + + case syntaxspec: + printf ("/syntaxspec"); + mcnt = *p++; + printf ("/%d", mcnt); + break; + + case notsyntaxspec: + printf ("/notsyntaxspec"); + mcnt = *p++; + printf ("/%d", mcnt); + break; + #endif /* emacs */ + + case wordchar: + printf ("/wordchar"); + break; + + case notwordchar: + printf ("/notwordchar"); + break; + + case begbuf: + printf ("/begbuf"); + break; + + case endbuf: + printf ("/endbuf"); + break; + + default: + printf ("?%d", *(p-1)); + } + + putchar ('\n'); + } + + printf ("%d:\tend of pattern.\n", p - start); + } + + + void + print_compiled_pattern (bufp) + struct re_pattern_buffer *bufp; + { + unsigned char *buffer = bufp->buffer; + + print_partial_compiled_pattern (buffer, buffer + bufp->used); + printf ("%d bytes used/%d bytes allocated.\n", bufp->used, bufp->allocated); + + if (bufp->fastmap_accurate && bufp->fastmap) + { + printf ("fastmap: "); + print_fastmap (bufp->fastmap); + } + + printf ("re_nsub: %d\t", bufp->re_nsub); + printf ("regs_alloc: %d\t", bufp->regs_allocated); + printf ("can_be_null: %d\t", bufp->can_be_null); + printf ("newline_anchor: %d\n", bufp->newline_anchor); + printf ("no_sub: %d\t", bufp->no_sub); + printf ("not_bol: %d\t", bufp->not_bol); + printf ("not_eol: %d\t", bufp->not_eol); + printf ("syntax: %d\n", bufp->syntax); + /* Perhaps we should print the translate table? 
*/ + } + + + void + print_double_string (where, string1, size1, string2, size2) + const char *where; + const char *string1; + const char *string2; + int size1; + int size2; + { + unsigned this_char; + + if (where == NULL) + printf ("(null)"); + else + { + if (FIRST_STRING_P (where)) + { + for (this_char = where - string1; this_char < size1; this_char++) + printchar (string1[this_char]); + + where = string2; + } + + for (this_char = where - string2; this_char < size2; this_char++) + printchar (string2[this_char]); + } + } + + #else /* not DEBUG */ + + #undef assert + #define assert(e) + + #define DEBUG_STATEMENT(e) + #define DEBUG_PRINT1(x) + #define DEBUG_PRINT2(x1, x2) + #define DEBUG_PRINT3(x1, x2, x3) + #define DEBUG_PRINT4(x1, x2, x3, x4) + #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) + #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) + + #endif /* not DEBUG */ + + /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can + also be assigned to arbitrarily: each pattern buffer stores its own + syntax, so it can be changed between regex compilations. */ + reg_syntax_t re_syntax_options = RE_SYNTAX_EMACS; + + + /* Specify the precise syntax of regexps for compilation. This provides + for compatibility for various utilities which historically have + different, incompatible syntaxes. + + The argument SYNTAX is a bit mask comprised of the various bits + defined in regex.h. We return the old syntax. */ + + reg_syntax_t + re_set_syntax (syntax) + reg_syntax_t syntax; + { + reg_syntax_t ret = re_syntax_options; + + re_syntax_options = syntax; + return ret; + } + + /* This table gives an error message for each of the error codes listed + in regex.h. Obviously the order here has to be same as there. 
*/ + + static const char *re_error_msg[] = + { NULL, /* REG_NOERROR */ + "No match", /* REG_NOMATCH */ + "Invalid regular expression", /* REG_BADPAT */ + "Invalid collation character", /* REG_ECOLLATE */ + "Invalid character class name", /* REG_ECTYPE */ + "Trailing backslash", /* REG_EESCAPE */ + "Invalid back reference", /* REG_ESUBREG */ + "Unmatched [ or [^", /* REG_EBRACK */ + "Unmatched ( or \\(", /* REG_EPAREN */ + "Unmatched \\{", /* REG_EBRACE */ + "Invalid content of \\{\\}", /* REG_BADBR */ + "Invalid range end", /* REG_ERANGE */ + "Memory exhausted", /* REG_ESPACE */ + "Invalid preceding regular expression", /* REG_BADRPT */ + "Premature end of regular expression", /* REG_EEND */ + "Regular expression too big", /* REG_ESIZE */ + "Unmatched ) or \\)", /* REG_ERPAREN */ + }; + + /* Subroutine declarations and macros for regex_compile. */ + + static reg_errcode_t regex_compile _RE_ARGS((const char *pattern, size_t size, + reg_syntax_t syntax, + struct re_pattern_buffer *bufp)); + static void store_op1 _RE_ARGS((re_opcode_t op, unsigned char *loc, int arg)); + static void store_op2 _RE_ARGS((re_opcode_t op, unsigned char *loc, + int arg1, int arg2)); + static void insert_op1 _RE_ARGS((re_opcode_t op, unsigned char *loc, + int arg, unsigned char *end)); + static void insert_op2 _RE_ARGS((re_opcode_t op, unsigned char *loc, + int arg1, int arg2, unsigned char *end)); + static boolean at_begline_loc_p _RE_ARGS((const char *pattern, const char *p, + reg_syntax_t syntax)); + static boolean at_endline_loc_p _RE_ARGS((const char *p, const char *pend, + reg_syntax_t syntax)); + static reg_errcode_t compile_range _RE_ARGS((const char **p_ptr, + const char *pend, + char *translate, + reg_syntax_t syntax, + unsigned char *b)); + + /* Fetch the next character in the uncompiled pattern---translating it + if necessary. 
Also cast from a signed character in the constant + string passed to us by the user to an unsigned char that we can use + as an array index (in, e.g., `translate'). */ + #define PATFETCH(c) \ + do {if (p == pend) return REG_EEND; \ + c = (unsigned char) *p++; \ + if (translate) c = translate[c]; \ + } while (0) + + /* Fetch the next character in the uncompiled pattern, with no + translation. */ + #define PATFETCH_RAW(c) \ + do {if (p == pend) return REG_EEND; \ + c = (unsigned char) *p++; \ + } while (0) + + /* Go backwards one character in the pattern. */ + #define PATUNFETCH p-- + + + /* If `translate' is non-null, return translate[D], else just D. We + cast the subscript to translate because some data is declared as + `char *', to avoid warnings when a string constant is passed. But + when we use a character as a subscript we must make it unsigned. */ + #define TRANSLATE(d) (translate ? translate[(unsigned char) (d)] : (d)) + + + /* Macros for outputting the compiled pattern into `buffer'. */ + + /* If the buffer isn't allocated when it comes in, use this. */ + #define INIT_BUF_SIZE 32 + + /* Make sure we have at least N more bytes of space in buffer. */ + #define GET_BUFFER_SPACE(n) \ + while (b - bufp->buffer + (n) > bufp->allocated) \ + EXTEND_BUFFER () + + /* Make sure we have one more byte of buffer space and then add C to it. */ + #define BUF_PUSH(c) \ + do { \ + GET_BUFFER_SPACE (1); \ + *b++ = (unsigned char) (c); \ + } while (0) + + + /* Ensure we have two more bytes of buffer space and then append C1 and C2. */ + #define BUF_PUSH_2(c1, c2) \ + do { \ + GET_BUFFER_SPACE (2); \ + *b++ = (unsigned char) (c1); \ + *b++ = (unsigned char) (c2); \ + } while (0) + + + /* As with BUF_PUSH_2, except for three bytes. */ + #define BUF_PUSH_3(c1, c2, c3) \ + do { \ + GET_BUFFER_SPACE (3); \ + *b++ = (unsigned char) (c1); \ + *b++ = (unsigned char) (c2); \ + *b++ = (unsigned char) (c3); \ + } while (0) + + + /* Store a jump with opcode OP at LOC to location TO. 
We store a + relative address offset by the three bytes the jump itself occupies. */ + #define STORE_JUMP(op, loc, to) \ + store_op1 (op, loc, (int)((to) - (loc) - 3)) + + /* Likewise, for a two-argument jump. */ + #define STORE_JUMP2(op, loc, to, arg) \ + store_op2 (op, loc, (int)((to) - (loc) - 3), arg) + + /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */ + #define INSERT_JUMP(op, loc, to) \ + insert_op1 (op, loc, (int)((to) - (loc) - 3), b) + + /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */ + #define INSERT_JUMP2(op, loc, to, arg) \ + insert_op2 (op, loc, (int)((to) - (loc) - 3), arg, b) + + + /* This is not an arbitrary limit: the arguments which represent offsets + into the pattern are two bytes long. So if 2^16 bytes turns out to + be too small, many things would have to change. */ + /* Any other compiler which, like MSC, has allocation limit below 2^16 + bytes will have to use approach similar to what was done below for + MSC and drop MAX_BUF_SIZE a bit. Otherwise you may end up + reallocating to 0 bytes. Such thing is not going to work too well. + You have been warned!! */ + #ifdef _MSC_VER + /* Microsoft C 16-bit versions limit malloc to approx 65512 bytes. + The REALLOC define eliminates a flurry of conversion warnings, + but is not required. */ + #define MAX_BUF_SIZE 65500L + #define REALLOC(p,s) realloc((p), (size_t) (s)) + #else + #define MAX_BUF_SIZE (1L << 16) + #define REALLOC realloc + #endif + + /* Extend the buffer by twice its current size via realloc and + reset the pointers that pointed into the old block to point to the + correct places in the new one. If extending the buffer results in it + being larger than MAX_BUF_SIZE, then flag memory exhausted. 
*/ + #define EXTEND_BUFFER() \ + do { \ + unsigned char *old_buffer = bufp->buffer; \ + if (bufp->allocated == MAX_BUF_SIZE) \ + return REG_ESIZE; \ + bufp->allocated <<= 1; \ + if (bufp->allocated > MAX_BUF_SIZE) \ + bufp->allocated = MAX_BUF_SIZE; \ + bufp->buffer = (unsigned char *) REALLOC(bufp->buffer, bufp->allocated);\ + if (bufp->buffer == NULL) \ + return REG_ESPACE; \ + /* If the buffer moved, rebase every pointer that points into it. */ \ + if (old_buffer != bufp->buffer) \ + { \ + b = (b - old_buffer) + bufp->buffer; \ + begalt = (begalt - old_buffer) + bufp->buffer; \ + if (fixup_alt_jump) \ + fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\ + if (laststart) \ + laststart = (laststart - old_buffer) + bufp->buffer; \ + if (pending_exact) \ + pending_exact = (pending_exact - old_buffer) + bufp->buffer; \ + } \ + } while (0) + + + /* Since we have one byte reserved for the register number argument to + {start,stop}_memory, the maximum number of groups we can report + things about is what fits in that byte. */ + #define MAX_REGNUM 255 + + /* But patterns can have more than `MAX_REGNUM' registers. We just + ignore the excess. */ + typedef unsigned regnum_t; + + + /* Macros for the compile stack. */ + + /* Since offsets can go either forwards or backwards, this type needs to + be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */ + /* int may not be enough when sizeof(int) == 2 */ + typedef long pattern_offset_t; + + typedef struct + { + pattern_offset_t begalt_offset; + pattern_offset_t fixup_alt_jump; + pattern_offset_t inner_group_offset; + pattern_offset_t laststart_offset; + regnum_t regnum; + } compile_stack_elt_t; + + + typedef struct + { + compile_stack_elt_t *stack; + unsigned size; + unsigned avail; /* Offset of next open position.
*/ + } compile_stack_type; + + + #define INIT_COMPILE_STACK_SIZE 32 + + #define COMPILE_STACK_EMPTY (compile_stack.avail == 0) + #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size) + + /* The next available element. */ + #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail]) + + + /* Set the bit for character C in a list. The bitmap is addressed through the variable `b' of the expansion context; C is parenthesized at both uses so that a compound argument is cast as a whole. */ + #define SET_LIST_BIT(c) \ + (b[((unsigned char) (c)) / BYTEWIDTH] \ + |= 1 << (((unsigned char) (c)) % BYTEWIDTH)) + + + /* Get the next unsigned number in the uncompiled pattern. Reads through `p' and `pend' into `c'; NUM is left untouched if no digit is seen, and no overflow check is made. */ + #define GET_UNSIGNED_NUMBER(num) \ + { if (p != pend) \ + { \ + PATFETCH (c); \ + while (ISDIGIT (c)) \ + { \ + if (num < 0) \ + num = 0; \ + num = num * 10 + c - '0'; \ + if (p == pend) \ + break; \ + PATFETCH (c); \ + } \ + } \ + } + + #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */ + + #define IS_CHAR_CLASS(string) \ + (STREQ (string, "alpha") || STREQ (string, "upper") \ + || STREQ (string, "lower") || STREQ (string, "digit") \ + || STREQ (string, "alnum") || STREQ (string, "xdigit") \ + || STREQ (string, "space") || STREQ (string, "print") \ + || STREQ (string, "punct") || STREQ (string, "graph") \ + || STREQ (string, "cntrl") || STREQ (string, "blank")) + + static boolean group_in_compile_stack _RE_ARGS((compile_stack_type + compile_stack, + regnum_t regnum)); + + /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX. + Returns one of error codes defined in `regex.h', or zero for success. + + Assumes the `allocated' (and perhaps `buffer') and `translate' + fields are set in BUFP on entry.
+ + If it succeeds, results are put in BUFP (if it returns an error, the + contents of BUFP are undefined): + `buffer' is the compiled pattern; + `syntax' is set to SYNTAX; + `used' is set to the length of the compiled pattern; + `fastmap_accurate' is zero; + `re_nsub' is the number of subexpressions in PATTERN; + `not_bol' and `not_eol' are zero; + + The `fastmap' and `newline_anchor' fields are neither + examined nor set. */ + + static reg_errcode_t + regex_compile (pattern, size, syntax, bufp) + const char *pattern; + size_t size; + reg_syntax_t syntax; + struct re_pattern_buffer *bufp; + { + /* We fetch characters from PATTERN here. Even though PATTERN is + `char *' (i.e., signed), we declare these variables as unsigned, so + they can be reliably used as array indices. */ + register unsigned char c, c1; + + /* A random tempory spot in PATTERN. */ + const char *p1; + + /* Points to the end of the buffer, where we should append. */ + register unsigned char *b; + + /* Keeps track of unclosed groups. */ + compile_stack_type compile_stack; + + /* Points to the current (ending) position in the pattern. */ + const char *p = pattern; + const char *pend = pattern + size; + + /* How to translate the characters in the pattern. */ + char *translate = bufp->translate; + + /* Address of the count-byte of the most recently inserted `exactn' + command. This makes it possible to tell if a new exact-match + character can be added to that command or if the character requires + a new `exactn' command. */ + unsigned char *pending_exact = 0; + + /* Address of start of the most recently finished expression. + This tells, e.g., postfix * where to find the start of its + operand. Reset at the beginning of groups and alternatives. */ + unsigned char *laststart = 0; + + /* Address of beginning of regexp, or inside of last group. */ + unsigned char *begalt; + + /* Place in the uncompiled pattern (i.e., the {) to + which to go back if the interval is invalid. 
*/ + const char *beg_interval; + + /* Address of the place where a forward jump should go to the end of + the containing expression. Each alternative of an `or' -- except the + last -- ends with a forward jump of this sort. */ + unsigned char *fixup_alt_jump = 0; + + /* Counts open-groups as they are encountered. Remembered for the + matching close-group on the compile stack, so the same register + number is put in the stop_memory as the start_memory. */ + regnum_t regnum = 0; + + #ifdef DEBUG + DEBUG_PRINT1 ("\nCompiling pattern: "); + if (debug) + { + unsigned debug_count; + + for (debug_count = 0; debug_count < size; debug_count++) + printchar (pattern[debug_count]); + putchar ('\n'); + } + #endif /* DEBUG */ + + /* Initialize the compile stack. */ + compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t); + if (compile_stack.stack == NULL) + return REG_ESPACE; + + compile_stack.size = INIT_COMPILE_STACK_SIZE; + compile_stack.avail = 0; + + /* Initialize the pattern buffer. */ + bufp->syntax = syntax; + bufp->fastmap_accurate = 0; + bufp->not_bol = bufp->not_eol = 0; + + /* Set `used' to zero, so that if we return an error, the pattern + printer (for debugging) will think there's no pattern. We reset it + at the end. */ + bufp->used = 0; + + /* Always count groups, whether or not bufp->no_sub is set. */ + bufp->re_nsub = 0; + + #if !defined (emacs) && !defined (SYNTAX_TABLE) + /* Initialize the syntax table. */ + init_syntax_once (); + #endif + + if (bufp->allocated == 0) + { + if (bufp->buffer) + { /* If zero allocated, but buffer is non-null, try to realloc + enough space. This loses if buffer's address is bogus, but + that is the user's responsibility. */ + RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char); + } + else + { /* Caller did not allocate a buffer. Do it for them. 
*/ + bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char); + } + if (!bufp->buffer) return REG_ESPACE; + + bufp->allocated = INIT_BUF_SIZE; + } + + begalt = b = bufp->buffer; + + /* Loop through the uncompiled pattern until we're at the end. */ + while (p != pend) + { + PATFETCH (c); + + switch (c) + { + case '^': + { + if ( /* If at start of pattern, it's an operator. */ + p == pattern + 1 + /* If context independent, it's an operator. */ + || syntax & RE_CONTEXT_INDEP_ANCHORS + /* Otherwise, depends on what's come before. */ + || at_begline_loc_p (pattern, p, syntax)) + BUF_PUSH (begline); + else + goto normal_char; + } + break; + + + case '$': + { + if ( /* If at end of pattern, it's an operator. */ + p == pend + /* If context independent, it's an operator. */ + || syntax & RE_CONTEXT_INDEP_ANCHORS + /* Otherwise, depends on what's next. */ + || at_endline_loc_p (p, pend, syntax)) + BUF_PUSH (endline); + else + goto normal_char; + } + break; + + + case '+': + case '?': + if ((syntax & RE_BK_PLUS_QM) + || (syntax & RE_LIMITED_OPS)) + goto normal_char; + handle_plus: + case '*': + /* If there is no previous pattern... */ + if (!laststart) + { + if (syntax & RE_CONTEXT_INVALID_OPS) + return REG_BADRPT; + else if (!(syntax & RE_CONTEXT_INDEP_OPS)) + goto normal_char; + } + + { + /* Are we optimizing this jump? */ + boolean keep_string_p = false; + + /* 1 means zero (many) matches is allowed. */ + char zero_times_ok = 0, many_times_ok = 0; + + /* If there is a sequence of repetition chars, collapse it + down to just one (the right one). We can't combine + interval operators with these because of, e.g., `a{2}*', + which should only match an even number of `a's. 
*/ + + for (;;) + { + zero_times_ok |= c != '+'; + many_times_ok |= c != '?'; + + if (p == pend) + break; + + PATFETCH (c); + + if (c == '*' + || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?'))) + ; + + else if (syntax & RE_BK_PLUS_QM && c == '\\') + { + if (p == pend) return REG_EESCAPE; + + PATFETCH (c1); + if (!(c1 == '+' || c1 == '?')) + { + PATUNFETCH; + PATUNFETCH; + break; + } + + c = c1; + } + else + { + PATUNFETCH; + break; + } + + /* If we get here, we found another repeat character. */ + } + + /* Star, etc. applied to an empty pattern is equivalent + to an empty pattern. */ + if (!laststart) + break; + + /* Now we know whether or not zero matches is allowed + and also whether or not two or more matches is allowed. */ + if (many_times_ok) + { /* More than one repetition is allowed, so put in at the + end a backward relative jump from `b' to before the next + jump we're going to put in below (which jumps from + laststart to after this jump). + + But if we are at the `*' in the exact sequence `.*\n', + insert an unconditional jump backwards to the ., + instead of the beginning of the loop. This way we only + push a failure point once, instead of every time + through the loop. */ + assert (p - 1 > pattern); + + /* Allocate the space for the jump. */ + GET_BUFFER_SPACE (3); + + /* We know we are not at the first character of the pattern, + because laststart was nonzero. And we've already + incremented `p', by the way, to be the character after + the `*'. Do we have to do something analogous here + for null bytes, because of RE_DOT_NOT_NULL? */ + if (TRANSLATE (*(p - 2)) == TRANSLATE ('.') + && zero_times_ok + && p < pend && TRANSLATE (*p) == TRANSLATE ('\n') + && !(syntax & RE_DOT_NEWLINE)) + { /* We have .*\n. */ + STORE_JUMP (jump, b, laststart); + keep_string_p = true; + } + else + /* Anything else. */ + STORE_JUMP (maybe_pop_jump, b, laststart - 3); + + /* We've added more stuff to the buffer. 
*/ + b += 3; + } + + /* On failure, jump from laststart to b + 3, which will be the + end of the buffer after this jump is inserted. */ + GET_BUFFER_SPACE (3); + INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump + : on_failure_jump, + laststart, b + 3); + pending_exact = 0; + b += 3; + + if (!zero_times_ok) + { + /* At least one repetition is required, so insert a + `dummy_failure_jump' before the initial + `on_failure_jump' instruction of the loop. This + effects a skip over that instruction the first time + we hit that loop. */ + GET_BUFFER_SPACE (3); + INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6); + b += 3; + } + } + break; + + + case '.': + laststart = b; + BUF_PUSH (anychar); + break; + + + case '[': + { + boolean had_char_class = false; + + if (p == pend) return REG_EBRACK; + + /* Ensure that we have enough space to push a charset: the + opcode, the length count, and the bitset; 34 bytes in all. */ + GET_BUFFER_SPACE (34); + + laststart = b; + + /* We test `*p == '^' twice, instead of using an if + statement, so we only need one BUF_PUSH. */ + BUF_PUSH (*p == '^' ? charset_not : charset); + if (*p == '^') + p++; + + /* Remember the first position in the bracket expression. */ + p1 = p; + + /* Push the number of bytes in the bitmap. */ + BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH); + + /* Clear the whole map. */ + bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH); + + /* charset_not matches newline according to a syntax bit. */ + if ((re_opcode_t) b[-2] == charset_not + && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) + SET_LIST_BIT ('\n'); + + /* Read in characters and ranges, setting map bits. */ + for (;;) + { + if (p == pend) return REG_EBRACK; + + PATFETCH (c); + + /* \ might escape characters inside [...] and [^...]. */ + if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') + { + if (p == pend) return REG_EESCAPE; + + PATFETCH (c1); + SET_LIST_BIT (c1); + continue; + } + + /* Could be the end of the bracket expression. 
If it's + not (i.e., when the bracket expression is `[]' so + far), the ']' character bit gets set way below. */ + if (c == ']' && p != p1 + 1) + break; + + /* Look ahead to see if it's a range when the last thing + was a character class. */ + if (had_char_class && c == '-' && *p != ']') + return REG_ERANGE; + + /* Look ahead to see if it's a range when the last thing + was a character: if this is a hyphen not at the + beginning or the end of a list, then it's the range + operator. */ + if (c == '-' + && !(p - 2 >= pattern && p[-2] == '[') + && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') + && *p != ']') + { + reg_errcode_t ret + = compile_range (&p, pend, translate, syntax, b); + if (ret != REG_NOERROR) return ret; + } + + else if (p[0] == '-' && p[1] != ']') + { /* This handles ranges made up of characters only. */ + reg_errcode_t ret; + + /* Move past the `-'. */ + PATFETCH (c1); + + ret = compile_range (&p, pend, translate, syntax, b); + if (ret != REG_NOERROR) return ret; + } + + /* See if we're at the beginning of a possible character + class. */ + + else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') + { /* Leave room for the null. */ + char str[CHAR_CLASS_MAX_LENGTH + 1]; + + PATFETCH (c); + c1 = 0; + + /* If pattern is `[[:'. */ + if (p == pend) return REG_EBRACK; + + for (;;) + { + PATFETCH (c); + if (c == ':' || c == ']' || p == pend + || c1 == CHAR_CLASS_MAX_LENGTH) + break; + str[c1++] = c; + } + str[c1] = '\0'; + + /* If isn't a word bracketed by `[:' and:`]': + undo the ending character, the letters, and leave + the leading `:' and `[' (but set bits for them). 
*/ + if (c == ':' && *p == ']') + { + int ch; + boolean is_alnum = STREQ (str, "alnum"); + boolean is_alpha = STREQ (str, "alpha"); + boolean is_blank = STREQ (str, "blank"); + boolean is_cntrl = STREQ (str, "cntrl"); + boolean is_digit = STREQ (str, "digit"); + boolean is_graph = STREQ (str, "graph"); + boolean is_lower = STREQ (str, "lower"); + boolean is_print = STREQ (str, "print"); + boolean is_punct = STREQ (str, "punct"); + boolean is_space = STREQ (str, "space"); + boolean is_upper = STREQ (str, "upper"); + boolean is_xdigit = STREQ (str, "xdigit"); + + if (!IS_CHAR_CLASS (str)) return REG_ECTYPE; + + /* Throw away the ] at the end of the character + class. */ + PATFETCH (c); + + if (p == pend) return REG_EBRACK; + + for (ch = 0; ch < 1 << BYTEWIDTH; ch++) + { + if ( (is_alnum && ISALNUM (ch)) + || (is_alpha && ISALPHA (ch)) + || (is_blank && ISBLANK (ch)) + || (is_cntrl && ISCNTRL (ch)) + || (is_digit && ISDIGIT (ch)) + || (is_graph && ISGRAPH (ch)) + || (is_lower && ISLOWER (ch)) + || (is_print && ISPRINT (ch)) + || (is_punct && ISPUNCT (ch)) + || (is_space && ISSPACE (ch)) + || (is_upper && ISUPPER (ch)) + || (is_xdigit && ISXDIGIT (ch))) + SET_LIST_BIT (ch); + } + had_char_class = true; + } + else + { + c1++; + while (c1--) + PATUNFETCH; + SET_LIST_BIT ('['); + SET_LIST_BIT (':'); + had_char_class = false; + } + } + else + { + had_char_class = false; + SET_LIST_BIT (c); + } + } + + /* Discard any (non)matching list bytes that are all 0 at the + end of the map. Decrease the map-length byte too. 
*/ + while ((int) b[-1] > 0 && b[b[-1] - 1] == 0) + b[-1]--; + b += b[-1]; + } + break; + + + case '(': + if (syntax & RE_NO_BK_PARENS) + goto handle_open; + else + goto normal_char; + + + case ')': + if (syntax & RE_NO_BK_PARENS) + goto handle_close; + else + goto normal_char; + + + case '\n': + if (syntax & RE_NEWLINE_ALT) + goto handle_alt; + else + goto normal_char; + + + case '|': + if (syntax & RE_NO_BK_VBAR) + goto handle_alt; + else + goto normal_char; + + + case '{': + if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES) + goto handle_interval; + else + goto normal_char; + + + case '\\': + if (p == pend) return REG_EESCAPE; + + /* Do not translate the character after the \, so that we can + distinguish, e.g., \B from \b, even if we normally would + translate, e.g., B to b. */ + PATFETCH_RAW (c); + + switch (c) + { + case '(': + if (syntax & RE_NO_BK_PARENS) + goto normal_backslash; + + handle_open: + bufp->re_nsub++; + regnum++; + + if (COMPILE_STACK_FULL) + { + RETALLOC (compile_stack.stack, compile_stack.size << 1, + compile_stack_elt_t); + if (compile_stack.stack == NULL) return REG_ESPACE; + + compile_stack.size <<= 1; + } + + /* These are the values to restore when we hit end of this + group. They are all relative offsets, so that if the + whole pattern moves because of realloc, they will still + be valid. */ + COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer; + COMPILE_STACK_TOP.fixup_alt_jump + = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0; + COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer; + COMPILE_STACK_TOP.regnum = regnum; + + /* We will eventually replace the 0 with the number of + groups inner to this one. But do not push a + start_memory for groups beyond the last one we can + represent in the compiled pattern. 
*/ + if (regnum <= MAX_REGNUM) + { + COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2; + BUF_PUSH_3 (start_memory, regnum, 0); + } + + compile_stack.avail++; + + fixup_alt_jump = 0; + laststart = 0; + begalt = b; + /* If we've reached MAX_REGNUM groups, then this open + won't actually generate any code, so we'll have to + clear pending_exact explicitly. */ + pending_exact = 0; + break; + + + case ')': + if (syntax & RE_NO_BK_PARENS) goto normal_backslash; + + if (COMPILE_STACK_EMPTY) + if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) + goto normal_backslash; + else + return REG_ERPAREN; + + handle_close: + if (fixup_alt_jump) + { /* Push a dummy failure point at the end of the + alternative for a possible future + `pop_failure_jump' to pop. See comments at + `push_dummy_failure' in `re_match_2'. */ + BUF_PUSH (push_dummy_failure); + + /* We allocated space for this jump when we assigned + to `fixup_alt_jump', in the `handle_alt' case below. */ + STORE_JUMP (jump_past_alt, fixup_alt_jump, b - 1); + } + + /* See similar code for backslashed left paren above. */ + if (COMPILE_STACK_EMPTY) + if (syntax & RE_UNMATCHED_RIGHT_PAREN_ORD) + goto normal_char; + else + return REG_ERPAREN; + + /* Since we just checked for an empty stack above, this + ``can't happen''. */ + assert (compile_stack.avail != 0); + { + /* We don't just want to restore into `regnum', because + later groups should continue to be numbered higher, + as in `(ab)c(de)' -- the second group is #2. */ + regnum_t this_group_regnum; + + compile_stack.avail--; + begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset; + fixup_alt_jump + = COMPILE_STACK_TOP.fixup_alt_jump + ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1 + : 0; + laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset; + this_group_regnum = COMPILE_STACK_TOP.regnum; + /* If we've reached MAX_REGNUM groups, then this open + won't actually generate any code, so we'll have to + clear pending_exact explicitly. 
*/ + pending_exact = 0; + + /* We're at the end of the group, so now we know how many + groups were inside this one. */ + if (this_group_regnum <= MAX_REGNUM) + { + unsigned char *inner_group_loc + = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset; + + *inner_group_loc = regnum - this_group_regnum; + BUF_PUSH_3 (stop_memory, this_group_regnum, + regnum - this_group_regnum); + } + } + break; + + + case '|': /* `\|'. */ + if (syntax & RE_LIMITED_OPS || syntax & RE_NO_BK_VBAR) + goto normal_backslash; + handle_alt: + if (syntax & RE_LIMITED_OPS) + goto normal_char; + + /* Insert before the previous alternative a jump which + jumps to this alternative if the former fails. */ + GET_BUFFER_SPACE (3); + INSERT_JUMP (on_failure_jump, begalt, b + 6); + pending_exact = 0; + b += 3; + + /* The alternative before this one has a jump after it + which gets executed if it gets matched. Adjust that + jump so it will jump to this alternative's analogous + jump (put in below, which in turn will jump to the next + (if any) alternative's such jump, etc.). The last such + jump jumps to the correct final destination. A picture: + _____ _____ + | | | | + | v | v + a | b | c + + If we are at `b', then fixup_alt_jump right now points to a + three-byte space after `a'. We'll put in the jump, set + fixup_alt_jump to right after `b', and leave behind three + bytes which we'll fill in when we get to after `c'. */ + + if (fixup_alt_jump) + STORE_JUMP (jump_past_alt, fixup_alt_jump, b); + + /* Mark and leave space for a jump after this alternative, + to be filled in later either by next alternative or + when know we're at the end of a series of alternatives. */ + fixup_alt_jump = b; + GET_BUFFER_SPACE (3); + b += 3; + + laststart = 0; + begalt = b; + break; + + + case '{': + /* If \{ is a literal. */ + if (!(syntax & RE_INTERVALS) + /* If we're at `\{' and it's not the open-interval + operator. 
*/ + || ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES)) + || (p - 2 == pattern && p == pend)) + goto normal_backslash; + + handle_interval: + { + /* If got here, then the syntax allows intervals. */ + + /* At least (most) this many matches must be made. */ + int lower_bound = -1, upper_bound = -1; + + beg_interval = p - 1; + + if (p == pend) + { + if (syntax & RE_NO_BK_BRACES) + goto unfetch_interval; + else + return REG_EBRACE; + } + + GET_UNSIGNED_NUMBER (lower_bound); + + if (c == ',') + { + GET_UNSIGNED_NUMBER (upper_bound); + if (upper_bound < 0) upper_bound = RE_DUP_MAX; + } + else + /* Interval such as `{1}' => match exactly once. */ + upper_bound = lower_bound; + + if (lower_bound < 0 || upper_bound > RE_DUP_MAX + || lower_bound > upper_bound) + { + if (syntax & RE_NO_BK_BRACES) + goto unfetch_interval; + else + return REG_BADBR; + } + + if (!(syntax & RE_NO_BK_BRACES)) + { + if (c != '\\') return REG_EBRACE; + + PATFETCH (c); + } + + if (c != '}') + { + if (syntax & RE_NO_BK_BRACES) + goto unfetch_interval; + else + return REG_BADBR; + } + + /* We just parsed a valid interval. */ + + /* If it's invalid to have no preceding re. */ + if (!laststart) + { + if (syntax & RE_CONTEXT_INVALID_OPS) + return REG_BADRPT; + else if (syntax & RE_CONTEXT_INDEP_OPS) + laststart = b; + else + goto unfetch_interval; + } + + /* If the upper bound is zero, don't want to succeed at + all; jump from `laststart' to `b + 3', which will be + the end of the buffer after we insert the jump. */ + if (upper_bound == 0) + { + GET_BUFFER_SPACE (3); + INSERT_JUMP (jump, laststart, b + 3); + b += 3; + } + + /* Otherwise, we have a nontrivial interval. When + we're all done, the pattern will look like: + set_number_at + set_number_at + succeed_n + + jump_n + (The upper bound and `jump_n' are omitted if + `upper_bound' is 1, though.) */ + else + { /* If the upper bound is > 1, we need to insert + more at the end of the loop. 
*/ + unsigned nbytes = 10 + (upper_bound > 1) * 10; + + GET_BUFFER_SPACE (nbytes); + + /* Initialize lower bound of the `succeed_n', even + though it will be set during matching by its + attendant `set_number_at' (inserted next), + because `re_compile_fastmap' needs to know. + Jump to the `jump_n' we might insert below. */ + INSERT_JUMP2 (succeed_n, laststart, + b + 5 + (upper_bound > 1) * 5, + lower_bound); + b += 5; + + /* Code to initialize the lower bound. Insert + before the `succeed_n'. The `5' is the last two + bytes of this `set_number_at', plus 3 bytes of + the following `succeed_n'. */ + insert_op2 (set_number_at, laststart, 5, lower_bound, b); + b += 5; + + if (upper_bound > 1) + { /* More than one repetition is allowed, so + append a backward jump to the `succeed_n' + that starts this interval. + + When we've reached this during matching, + we'll have matched the interval once, so + jump back only `upper_bound - 1' times. */ + STORE_JUMP2 (jump_n, b, laststart + 5, + upper_bound - 1); + b += 5; + + /* The location we want to set is the second + parameter of the `jump_n'; that is `b-2' as + an absolute address. `laststart' will be + the `set_number_at' we're about to insert; + `laststart+3' the number to set, the source + for the relative address. But we are + inserting into the middle of the pattern -- + so everything is getting moved up by 5. + Conclusion: (b - 2) - (laststart + 3) + 5, + i.e., b - laststart. + + We insert this at the beginning of the loop + so that if we fail during matching, we'll + reinitialize the bounds. */ + insert_op2 (set_number_at, laststart, b - laststart, + upper_bound - 1, b); + b += 5; + } + } + pending_exact = 0; + beg_interval = NULL; + } + break; + + unfetch_interval: + /* If an invalid interval, match the characters as literals. */ + assert (beg_interval); + p = beg_interval; + beg_interval = NULL; + + /* normal_char and normal_backslash need `c'. 
*/ + PATFETCH (c); + + if (!(syntax & RE_NO_BK_BRACES)) + { + if (p > pattern && p[-1] == '\\') + goto normal_backslash; + } + goto normal_char; + + #ifdef emacs + /* There is no way to specify the before_dot and after_dot + operators. rms says this is ok. --karl */ + case '=': + BUF_PUSH (at_dot); + break; + + case 's': + laststart = b; + PATFETCH (c); + BUF_PUSH_2 (syntaxspec, syntax_spec_code[c]); + break; + + case 'S': + laststart = b; + PATFETCH (c); + BUF_PUSH_2 (notsyntaxspec, syntax_spec_code[c]); + break; + #endif /* emacs */ + + + case 'w': + if (re_syntax_options & RE_NO_GNU_OPS) + goto normal_char; + laststart = b; + BUF_PUSH (wordchar); + break; + + + case 'W': + if (re_syntax_options & RE_NO_GNU_OPS) + goto normal_char; + laststart = b; + BUF_PUSH (notwordchar); + break; + + + case '<': + if (re_syntax_options & RE_NO_GNU_OPS) + goto normal_char; + BUF_PUSH (wordbeg); + break; + + case '>': + if (re_syntax_options & RE_NO_GNU_OPS) + goto normal_char; + BUF_PUSH (wordend); + break; + + case 'b': + if (re_syntax_options & RE_NO_GNU_OPS) + goto normal_char; + BUF_PUSH (wordbound); + break; + + case 'B': + if (re_syntax_options & RE_NO_GNU_OPS) + goto normal_char; + BUF_PUSH (notwordbound); + break; + + case '`': + if (re_syntax_options & RE_NO_GNU_OPS) + goto normal_char; + BUF_PUSH (begbuf); + break; + + case '\'': + if (re_syntax_options & RE_NO_GNU_OPS) + goto normal_char; + BUF_PUSH (endbuf); + break; + + case '1': case '2': case '3': case '4': case '5': + case '6': case '7': case '8': case '9': + if (syntax & RE_NO_BK_REFS) + goto normal_char; + + c1 = c - '0'; + + if (c1 > regnum) + return REG_ESUBREG; + + /* Can't back reference to a subexpression if inside of it. 
*/ + if (group_in_compile_stack (compile_stack, (regnum_t)c1)) + goto normal_char; + + laststart = b; + BUF_PUSH_2 (duplicate, c1); + break; + + + case '+': + case '?': + if (syntax & RE_BK_PLUS_QM) + goto handle_plus; + else + goto normal_backslash; + + default: + normal_backslash: + /* You might think it would be useful for \ to mean + not to translate; but if we don't translate it + it will never match anything. */ + c = TRANSLATE (c); + goto normal_char; + } + break; + + + default: + /* Expects the character in `c'. */ + normal_char: + /* If no exactn currently being built. */ + if (!pending_exact + + /* If last exactn not at current position. */ + || pending_exact + *pending_exact + 1 != b + + /* We have only one byte following the exactn for the count. */ + || *pending_exact == (1 << BYTEWIDTH) - 1 + + /* If followed by a repetition operator. */ + || *p == '*' || *p == '^' + || ((syntax & RE_BK_PLUS_QM) + ? *p == '\\' && (p[1] == '+' || p[1] == '?') + : (*p == '+' || *p == '?')) + || ((syntax & RE_INTERVALS) + && ((syntax & RE_NO_BK_BRACES) + ? *p == '{' + : (p[0] == '\\' && p[1] == '{')))) + { + /* Start building a new exactn. */ + + laststart = b; + + BUF_PUSH_2 (exactn, 0); + pending_exact = b - 1; + } + + BUF_PUSH (c); + (*pending_exact)++; + break; + } /* switch (c) */ + } /* while p != pend */ + + + /* Through the pattern now. */ + + if (fixup_alt_jump) + STORE_JUMP (jump_past_alt, fixup_alt_jump, b); + + if (!COMPILE_STACK_EMPTY) + return REG_EPAREN; + + free (compile_stack.stack); + + /* We have succeeded; set the length of the buffer. */ + bufp->used = b - bufp->buffer; + + #ifdef DEBUG + if (debug) + { + DEBUG_PRINT1 ("\nCompiled pattern: \n"); + print_compiled_pattern (bufp); + } + #endif /* DEBUG */ + + return REG_NOERROR; + } /* regex_compile */ + + /* Subroutines for `regex_compile'. */ + + /* Store OP at LOC followed by two-byte integer parameter ARG. 
*/ + + static void + store_op1 (op, loc, arg) + re_opcode_t op; + unsigned char *loc; + int arg; + { + *loc = (unsigned char) op; + STORE_NUMBER (loc + 1, arg); + } + + + /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2; ARG1 is stored at LOC + 1 and ARG2 at LOC + 3. */ + + static void + store_op2 (op, loc, arg1, arg2) + re_opcode_t op; + unsigned char *loc; + int arg1, arg2; + { + *loc = (unsigned char) op; + STORE_NUMBER (loc + 1, arg1); + STORE_NUMBER (loc + 3, arg2); + } + + + /* Move the bytes from LOC to END up by three, back to front, to open up three bytes of space at LOC + for OP followed by two-byte integer parameter ARG. Nothing is allocated here, so the three bytes starting at END must already be writable. */ + + static void + insert_op1 (op, loc, arg, end) + re_opcode_t op; + unsigned char *loc; + int arg; + unsigned char *end; + { + register unsigned char *pfrom = end; + register unsigned char *pto = end + 3; + + while (pfrom != loc) + *--pto = *--pfrom; + + store_op1 (op, loc, arg); + } + + + /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2, so the bytes are moved up by five. */ + + static void + insert_op2 (op, loc, arg1, arg2, end) + re_opcode_t op; + unsigned char *loc; + int arg1, arg2; + unsigned char *end; + { + register unsigned char *pfrom = end; + register unsigned char *pto = end + 5; + + while (pfrom != loc) + *--pto = *--pfrom; + + store_op2 (op, loc, arg1, arg2); + } + + + /* P points to just after a ^ in PATTERN. Return true if that ^ comes + after an alternative or a begin-subexpression. We assume there is at + least one character before the ^, since `P - 2' is dereferenced unconditionally. */ + + static boolean + at_begline_loc_p (pattern, p, syntax) + const char *pattern, *p; + reg_syntax_t syntax; + { + const char *prev = p - 2; + boolean prev_prev_backslash = prev > pattern && prev[-1] == '\\'; + + return + /* After a subexpression? */ + (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash)) + /* After an alternative? */ + || (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash)); + } + + + /* The dual of at_begline_loc_p. This one is for $.
We assume there is + at least one character after the $, i.e., `P < PEND'. */ + + static boolean + at_endline_loc_p (p, pend, syntax) + const char *p, *pend; + reg_syntax_t syntax; + { + const char *next = p; + boolean next_backslash = *next == '\\'; + const char *next_next = p + 1 < pend ? p + 1 : NULL; + + return + /* Before a subexpression? */ + (syntax & RE_NO_BK_PARENS ? *next == ')' + : next_backslash && next_next && *next_next == ')') + /* Before an alternative? */ + || (syntax & RE_NO_BK_VBAR ? *next == '|' + : next_backslash && next_next && *next_next == '|'); + } + + + /* Returns true if REGNUM is in one of COMPILE_STACK's elements and + false if it's not. Only the elements below `avail' are examined, from the top down. */ + + static boolean + group_in_compile_stack (compile_stack, regnum) + compile_stack_type compile_stack; + regnum_t regnum; + { + int this_element; + + for (this_element = compile_stack.avail - 1; + this_element >= 0; + this_element--) + if (compile_stack.stack[this_element].regnum == regnum) + return true; + + return false; + } + + + /* Read the ending character of a range (in a bracket expression) from the + uncompiled pattern *P_PTR (which ends at PEND). We assume the + starting character is in `P[-2]'. (`P[-1]' is the character `-'.) + Then we set the translation of all bits between the starting and + ending characters (inclusive) in the compiled pattern B. + + Return REG_ERANGE if the pattern ends before the range's ending character, or if the range is empty and RE_NO_EMPTY_RANGES is set in SYNTAX; otherwise return REG_NOERROR. + + We use these short variable names so we can use the same macros as + `regex_compile' itself. */ + + static reg_errcode_t + compile_range (p_ptr, pend, translate, syntax, b) + const char **p_ptr, *pend; + char *translate; + reg_syntax_t syntax; + unsigned char *b; + { + unsigned this_char; + + const char *p = *p_ptr; + int range_start, range_end; + + if (p == pend) + return REG_ERANGE; + + /* Even though the pattern is a signed `char *', we need to fetch + with unsigned char *'s; if the high bit of the pattern character + is set, the range endpoints will be negative if we fetch using a + signed char *.
+ + We also want to fetch the endpoints without translating them; the + appropriate translation is done in the bit-setting loop below. */ + range_start = ((unsigned char *) p)[-2]; + range_end = ((unsigned char *) p)[0]; + + /* Have to increment the pointer into the pattern string, so the + caller isn't still at the ending character. */ + (*p_ptr)++; + + /* If the start is after the end, the range is empty. */ + if (range_start > range_end) + return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR; + + /* Here we see why `this_char' has to be larger than an `unsigned + char' -- the range is inclusive, so if `range_end' == 0xff + (assuming 8-bit characters), we would otherwise go into an infinite + loop, since all characters <= 0xff. */ + for (this_char = range_start; this_char <= range_end; this_char++) + { + SET_LIST_BIT (TRANSLATE (this_char)); + } + + return REG_NOERROR; + } + + /* Failure stack declarations and macros; both re_compile_fastmap and + re_match_2 use a failure stack. These have to be macros because of + REGEX_ALLOCATE. */ + + + /* Number of failure points for which to initially allocate space + when matching. If this number is exceeded, we allocate more + space, so it is not a hard limit. */ + #ifndef INIT_FAILURE_ALLOC + #define INIT_FAILURE_ALLOC 5 + #endif + + /* Roughly the maximum number of failure points on the stack. Would be + exactly that if we always used MAX_FAILURE_SPACE each time we failed. + This is a variable only so users of regex can assign to it; we never + change it ourselves. */ + int re_max_failures = 2000; + + typedef const unsigned char *fail_stack_elt_t; + + typedef struct + { + fail_stack_elt_t *stack; + unsigned size; + unsigned avail; /* Offset of next open position.
*/ + } fail_stack_type; + + #define FAIL_STACK_EMPTY() (fail_stack.avail == 0) + #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0) + #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size) + #define FAIL_STACK_TOP() (fail_stack.stack[fail_stack.avail]) + + + /* Initialize `fail_stack'. Do `return -2' if the alloc fails. */ + + #define INIT_FAIL_STACK() \ + do { \ + fail_stack.stack = (fail_stack_elt_t *) \ + REGEX_ALLOCATE (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \ + \ + if (fail_stack.stack == NULL) \ + return -2; \ + \ + fail_stack.size = INIT_FAILURE_ALLOC; \ + fail_stack.avail = 0; \ + } while (0) + + + /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items. + + Return 1 if it succeeds, and 0 if it either ran out of memory + allocating space for it or it was already too large. + + REGEX_REALLOCATE requires `destination' be declared. */ + + #define DOUBLE_FAIL_STACK(fail_stack) \ + ((fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \ + ? 0 \ + : ((fail_stack).stack = (fail_stack_elt_t *) \ + REGEX_REALLOCATE ((fail_stack).stack, \ + (fail_stack).size * sizeof (fail_stack_elt_t), \ + ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \ + \ + (fail_stack).stack == NULL \ + ? 0 \ + : ((fail_stack).size <<= 1, \ + 1))) + + + /* Push PATTERN_OP on FAIL_STACK. + + Return 1 if it was able to do so and 0 if it ran out of memory allocating + space to do so. */ + #define PUSH_PATTERN_OP(pattern_op, fail_stack) \ + ((FAIL_STACK_FULL () \ + && !DOUBLE_FAIL_STACK (fail_stack)) \ + ? 0 \ + : ((fail_stack).stack[(fail_stack).avail++] = pattern_op, \ + 1)) + + /* This pushes an item onto the failure stack, cast to `fail_stack_elt_t'. Must be a four-byte + value (NOTE(review): presumably this assumes four-byte pointers -- confirm). Assumes the variable `fail_stack'. Probably should only + be called from within `PUSH_FAILURE_POINT'. */ + #define PUSH_FAILURE_ITEM(item) \ + fail_stack.stack[fail_stack.avail++] = (fail_stack_elt_t) item + + /* The complement operation. Assumes `fail_stack' is nonempty.
*/ + #define POP_FAILURE_ITEM() fail_stack.stack[--fail_stack.avail] + + /* Used to omit pushing failure point id's when we're not debugging. */ + #ifdef DEBUG + #define DEBUG_PUSH PUSH_FAILURE_ITEM + #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_ITEM () + #else + #define DEBUG_PUSH(item) + #define DEBUG_POP(item_addr) + #endif + + + /* Push the information about the state we will need + if we ever fail back to it. + + Requires variables fail_stack, regstart, regend, reg_info, and + num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be + declared. + + Does `return FAILURE_CODE' if runs out of memory. */ + + #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \ + do { \ + char *destination; \ + /* Must be int, so when we don't save any registers, the arithmetic \ + of 0 + -1 isn't done as unsigned. */ \ + /* Can't be int, since there is not a shred of a guarantee that int \ + is wide enough to hold a value of something to which pointer can \ + be assigned */ \ + s_reg_t this_reg; \ + \ + DEBUG_STATEMENT (failure_id++); \ + DEBUG_STATEMENT (nfailure_points_pushed++); \ + DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \ + DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\ + DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\ + \ + DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \ + DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \ + \ + /* Ensure we have enough space allocated for what we will push. */ \ + while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \ + { \ + if (!DOUBLE_FAIL_STACK (fail_stack)) \ + return failure_code; \ + \ + DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \ + (fail_stack).size); \ + DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\ + } + + #define PUSH_FAILURE_POINT2(pattern_place, string_place, failure_code) \ + /* Push the info, starting with the registers. 
*/ \ + DEBUG_PRINT1 ("\n"); \ + \ + PUSH_FAILURE_POINT_LOOP (); \ + \ + DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg);\ + PUSH_FAILURE_ITEM (lowest_active_reg); \ + \ + DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg);\ + PUSH_FAILURE_ITEM (highest_active_reg); \ + \ + DEBUG_PRINT2 (" Pushing pattern 0x%x: ", pattern_place); \ + DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \ + PUSH_FAILURE_ITEM (pattern_place); \ + \ + DEBUG_PRINT2 (" Pushing string 0x%x: `", string_place); \ + DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \ + size2); \ + DEBUG_PRINT1 ("'\n"); \ + PUSH_FAILURE_ITEM (string_place); \ + \ + DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \ + DEBUG_PUSH (failure_id); \ + } while (0) + + /* Pulled out of PUSH_FAILURE_POINT() to shorten the definition + of that macro. (for VAX C) */ + #define PUSH_FAILURE_POINT_LOOP() \ + for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \ + this_reg++) \ + { \ + DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \ + DEBUG_STATEMENT (num_regs_pushed++); \ + \ + DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \ + PUSH_FAILURE_ITEM (regstart[this_reg]); \ + \ + DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \ + PUSH_FAILURE_ITEM (regend[this_reg]); \ + \ + DEBUG_PRINT2 (" info: 0x%x\n ", reg_info[this_reg]); \ + DEBUG_PRINT2 (" match_null=%d", \ + REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \ + DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \ + DEBUG_PRINT2 (" matched_something=%d", \ + MATCHED_SOMETHING (reg_info[this_reg])); \ + DEBUG_PRINT2 (" ever_matched=%d", \ + EVER_MATCHED_SOMETHING (reg_info[this_reg])); \ + DEBUG_PRINT1 ("\n"); \ + PUSH_FAILURE_ITEM (reg_info[this_reg].word); \ + } + + /* This is the number of items that are pushed and popped on the stack + for each register. */ + #define NUM_REG_ITEMS 3 + + /* Individual items aside from the registers. 
*/ + #ifdef DEBUG + #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */ + #else + #define NUM_NONREG_ITEMS 4 + #endif + + /* We push at most this many items on the stack. */ + #define MAX_FAILURE_ITEMS ((num_regs - 1) * NUM_REG_ITEMS + NUM_NONREG_ITEMS) + + /* We actually push this many items. */ + #define NUM_FAILURE_ITEMS \ + ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \ + + NUM_NONREG_ITEMS) + + /* How many items can still be added to the stack without overflowing it. */ + #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail) + + + /* Pops what PUSH_FAILURE_POINT pushes. + + We restore into the parameters, all of which should be lvalues: + STR -- the saved data position. + PAT -- the saved pattern position. + LOW_REG, HIGH_REG -- the lowest and highest active registers. + REGSTART, REGEND -- arrays of string positions. + REG_INFO -- array of information about each subexpression. + + Also assumes the variables `fail_stack' and (if debugging), `bufp', + `pend', `string1', `size1', `string2', and `size2'. */ + + #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\ + { \ + DEBUG_STATEMENT (fail_stack_elt_t failure_id;) \ + s_reg_t this_reg; \ + const unsigned char *string_temp; \ + \ + assert (!FAIL_STACK_EMPTY ()); \ + \ + /* Remove failure points and point to how many regs pushed. */ \ + DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \ + DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \ + DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \ + \ + assert (fail_stack.avail >= NUM_NONREG_ITEMS); \ + \ + DEBUG_POP (&failure_id); \ + DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \ + \ + /* If the saved string location is NULL, it came from an \ + on_failure_keep_string_jump opcode, and we want to throw away the \ + saved NULL, thus retaining our current position in the string. 
*/ \ + string_temp = POP_FAILURE_ITEM (); \ + if (string_temp != NULL) \ + str = (const char *) string_temp; \ + \ + DEBUG_PRINT2 (" Popping string 0x%x: `", str); \ + DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \ + DEBUG_PRINT1 ("'\n"); \ + \ + pat = (unsigned char *) POP_FAILURE_ITEM (); \ + DEBUG_PRINT2 (" Popping pattern 0x%x: ", pat); \ + DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \ + \ + POP_FAILURE_POINT2 (low_reg, high_reg, regstart, regend, reg_info); + + /* Pulled out of POP_FAILURE_POINT() to shorten the definition + of that macro. (for MSC 5.1) */ + #define POP_FAILURE_POINT2(low_reg, high_reg, regstart, regend, reg_info) \ + \ + /* Restore register info. */ \ + high_reg = (active_reg_t) POP_FAILURE_ITEM (); \ + DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \ + \ + low_reg = (active_reg_t) POP_FAILURE_ITEM (); \ + DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \ + \ + for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \ + { \ + DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \ + \ + reg_info[this_reg].word = POP_FAILURE_ITEM (); \ + DEBUG_PRINT2 (" info: 0x%x\n", reg_info[this_reg]); \ + \ + regend[this_reg] = (const char *) POP_FAILURE_ITEM (); \ + DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \ + \ + regstart[this_reg] = (const char *) POP_FAILURE_ITEM (); \ + DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \ + } \ + \ + DEBUG_STATEMENT (nfailure_points_popped++); \ + } /* POP_FAILURE_POINT */ + + + /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in + BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible + characters can start a string that matches the pattern. This fastmap + is used by re_search to skip quickly over impossible starting points. + + The caller must supply the address of a (1 << BYTEWIDTH)-byte data + area as BUFP->fastmap. + + We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in + the pattern buffer. 
+ + Returns 0 if we succeed, -2 if an internal error. */ + + int + re_compile_fastmap (bufp) + struct re_pattern_buffer *bufp; + { + int j, k; + fail_stack_type fail_stack; + #ifndef REGEX_MALLOC + char *destination; + #endif + /* We don't push any register information onto the failure stack. */ + unsigned num_regs = 0; + + register char *fastmap = bufp->fastmap; + unsigned char *pattern = bufp->buffer; + const unsigned char *p = pattern; + register unsigned char *pend = pattern + bufp->used; + + /* Assume that each path through the pattern can be null until + proven otherwise. We set this false at the bottom of switch + statement, to which we get only if a particular path doesn't + match the empty string. */ + boolean path_can_be_null = true; + + /* We aren't doing a `succeed_n' to begin with. */ + boolean succeed_n_p = false; + + assert (fastmap != NULL && p != NULL); + + INIT_FAIL_STACK (); + bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */ + bufp->fastmap_accurate = 1; /* It will be when we're done. */ + bufp->can_be_null = 0; + + while (p != pend || !FAIL_STACK_EMPTY ()) + { + if (p == pend) + { + bufp->can_be_null |= path_can_be_null; + + /* Reset for next path. */ + path_can_be_null = true; + + p = fail_stack.stack[--fail_stack.avail]; + } + + /* We should never be about to go beyond the end of the pattern. */ + assert (p < pend); + + #ifdef SWITCH_ENUM_BUG + switch ((int) ((re_opcode_t) *p++)) + #else + switch ((re_opcode_t) *p++) + #endif + { + + /* I guess the idea here is to simply not bother with a fastmap + if a backreference is used, since it's too hard to figure out + the fastmap for the corresponding group. Setting + `can_be_null' stops `re_search_2' from using the fastmap, so + that is all we do. */ + case duplicate: + bufp->can_be_null = 1; + return 0; + + + /* Following are the cases which match a character. These end + with `break'. 
*/ + + case exactn: + fastmap[p[1]] = 1; + break; + + + case charset: + for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) + if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))) + fastmap[j] = 1; + break; + + + case charset_not: + /* Chars beyond end of map must be allowed. */ + for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++) + fastmap[j] = 1; + + for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--) + if (!(p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))) + fastmap[j] = 1; + break; + + + case wordchar: + for (j = 0; j < (1 << BYTEWIDTH); j++) + if (SYNTAX (j) == Sword) + fastmap[j] = 1; + break; + + + case notwordchar: + for (j = 0; j < (1 << BYTEWIDTH); j++) + if (SYNTAX (j) != Sword) + fastmap[j] = 1; + break; + + + case anychar: + /* `.' matches anything ... */ + for (j = 0; j < (1 << BYTEWIDTH); j++) + fastmap[j] = 1; + + /* ... except perhaps newline. */ + if (!(bufp->syntax & RE_DOT_NEWLINE)) + fastmap['\n'] = 0; + + /* Return if we have already set `can_be_null'; if we have, + then the fastmap is irrelevant. Something's wrong here. */ + else if (bufp->can_be_null) + return 0; + + /* Otherwise, have to check alternative paths. */ + break; + + + #ifdef emacs + case syntaxspec: + k = *p++; + for (j = 0; j < (1 << BYTEWIDTH); j++) + if (SYNTAX (j) == (enum syntaxcode) k) + fastmap[j] = 1; + break; + + + case notsyntaxspec: + k = *p++; + for (j = 0; j < (1 << BYTEWIDTH); j++) + if (SYNTAX (j) != (enum syntaxcode) k) + fastmap[j] = 1; + break; + + + /* All cases after this match the empty string. These end with + `continue'. 
*/ + + + case before_dot: + case at_dot: + case after_dot: + continue; + #endif /* not emacs */ + + + case no_op: + case begline: + case endline: + case begbuf: + case endbuf: + case wordbound: + case notwordbound: + case wordbeg: + case wordend: + case push_dummy_failure: + continue; + + + case jump_n: + case pop_failure_jump: + case maybe_pop_jump: + case jump: + case jump_past_alt: + case dummy_failure_jump: + EXTRACT_NUMBER_AND_INCR (j, p); + p += j; + if (j > 0) + continue; + + /* Jump backward implies we just went through the body of a + loop and matched nothing. Opcode jumped to should be + `on_failure_jump' or `succeed_n'. Just treat it like an + ordinary jump. For a * loop, it has pushed its failure + point already; if so, discard that as redundant. */ + if ((re_opcode_t) *p != on_failure_jump + && (re_opcode_t) *p != succeed_n) + continue; + + p++; + EXTRACT_NUMBER_AND_INCR (j, p); + p += j; + + /* If what's on the stack is where we are now, pop it. */ + if (!FAIL_STACK_EMPTY () + && fail_stack.stack[fail_stack.avail - 1] == p) + fail_stack.avail--; + + continue; + + + case on_failure_jump: + case on_failure_keep_string_jump: + handle_on_failure_jump: + EXTRACT_NUMBER_AND_INCR (j, p); + + /* For some patterns, e.g., `(a?)?', `p+j' here points to the + end of the pattern. We don't want to push such a point, + since when we restore it above, entering the switch will + increment `p' past the end of the pattern. We don't need + to push such a point since we obviously won't find any more + fastmap entries beyond `pend'. Such a pattern can match + the null string, though. */ + if (p + j < pend) + { + if (!PUSH_PATTERN_OP (p + j, fail_stack)) + return -2; + } + else + bufp->can_be_null = 1; + + if (succeed_n_p) + { + EXTRACT_NUMBER_AND_INCR (k, p); /* Skip the n. */ + succeed_n_p = false; + } + + continue; + + + case succeed_n: + /* Get to the number of times to succeed. */ + p += 2; + + /* Increment p past the n for when k != 0. 
*/ + EXTRACT_NUMBER_AND_INCR (k, p); + if (k == 0) + { + p -= 4; + succeed_n_p = true; /* Spaghetti code alert. */ + goto handle_on_failure_jump; + } + continue; + + + case set_number_at: + p += 4; + continue; + + + case start_memory: + case stop_memory: + p += 2; + continue; + + + default: + abort (); /* We have listed all the cases. */ + } /* switch *p++ */ + + /* Getting here means we have found the possible starting + characters for one path of the pattern -- and that the empty + string does not match. We need not follow this path further. + Instead, look at the next alternative (remembered on the + stack), or quit if no more. The test at the top of the loop + does these things. */ + path_can_be_null = false; + p = pend; + } /* while p */ + + /* Set `can_be_null' for the last path (also the first path, if the + pattern is empty). */ + bufp->can_be_null |= path_can_be_null; + return 0; + } /* re_compile_fastmap */ + + /* Set REGS to hold NUM_REGS registers, storing them in STARTS and + ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use + this memory for recording register information. STARTS and ENDS + must be allocated using the malloc library routine, and must each + be at least NUM_REGS * sizeof (regoff_t) bytes long. + + If NUM_REGS == 0, then subsequent matches should allocate their own + register data. + + Unless this function is called, the first search or match using + PATTERN_BUFFER will allocate its own register data, without + freeing the old data. */ + + void + re_set_registers (bufp, regs, num_regs, starts, ends) + struct re_pattern_buffer *bufp; + struct re_registers *regs; + unsigned num_regs; + regoff_t *starts, *ends; + { + if (num_regs) + { + bufp->regs_allocated = REGS_REALLOCATE; + regs->num_regs = num_regs; + regs->start = starts; + regs->end = ends; + } + else + { + bufp->regs_allocated = REGS_UNALLOCATED; + regs->num_regs = 0; + regs->start = regs->end = 0; + } + } + + /* Searching routines. 
*/ + + /* Like re_search_2, below, but only one string is specified, and + doesn't let you say where to stop matching. */ + + int + re_search (bufp, string, size, startpos, range, regs) + struct re_pattern_buffer *bufp; + const char *string; + int size, startpos, range; + struct re_registers *regs; + { + return re_search_2 (bufp, NULL, 0, string, size, startpos, range, + regs, size); + } + + + /* Using the compiled pattern in BUFP->buffer, first tries to match the + virtual concatenation of STRING1 and STRING2, starting first at index + STARTPOS, then at STARTPOS + 1, and so on. + + STRING1 and STRING2 have length SIZE1 and SIZE2, respectively. + + RANGE is how far to scan while trying to match. RANGE = 0 means try + only at STARTPOS; in general, the last start tried is STARTPOS + + RANGE. + + In REGS, return the indices of the virtual concatenation of STRING1 + and STRING2 that matched the entire BUFP->buffer and its contained + subexpressions. + + Do not consider matching one past the index STOP in the virtual + concatenation of STRING1 and STRING2. + + We return either the position in the strings at which the match was + found, -1 if no match, or -2 if error (such as failure + stack overflow). */ + + int + re_search_2 (bufp, string1, size1, string2, size2, startpos, range, regs, stop) + struct re_pattern_buffer *bufp; + const char *string1, *string2; + int size1, size2; + int startpos; + int range; + struct re_registers *regs; + int stop; + { + int val; + register char *fastmap = bufp->fastmap; + register char *translate = bufp->translate; + int total_size = size1 + size2; + int endpos = startpos + range; + + /* Check for out-of-range STARTPOS. */ + if (startpos < 0 || startpos > total_size) + return -1; + + /* Fix up RANGE if it might eventually take us outside + the virtual concatenation of STRING1 and STRING2. 
*/ + if (endpos < -1) + range = -1 - startpos; + else if (endpos > total_size) + range = total_size - startpos; + + /* If the search isn't to be a backwards one, don't waste time in a + search for a pattern that must be anchored. */ + if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf && range > 0) + { + if (startpos > 0) + return -1; + else + range = 1; + } + + /* Update the fastmap now if not correct already. */ + if (fastmap && !bufp->fastmap_accurate) + if (re_compile_fastmap (bufp) == -2) + return -2; + + /* Loop through the string, looking for a place to start matching. */ + for (;;) + { + /* If a fastmap is supplied, skip quickly over characters that + cannot be the start of a match. If the pattern can match the + null string, however, we don't need to skip characters; we want + the first null string. */ + if (fastmap && startpos < total_size && !bufp->can_be_null) + { + if (range > 0) /* Searching forwards. */ + { + register const char *d; + register int lim = 0; + int irange = range; + + if (startpos < size1 && startpos + range >= size1) + lim = range - (size1 - startpos); + + d = (startpos >= size1 ? string2 - size1 : string1) + startpos; + + /* Written out as an if-else to avoid testing `translate' + inside the loop. */ + if (translate) + while (range > lim + && !fastmap[(unsigned char) + translate[(unsigned char) *d++]]) + range--; + else + while (range > lim && !fastmap[(unsigned char) *d++]) + range--; + + startpos += irange - range; + } + else /* Searching backwards. */ + { + register char c = (size1 == 0 || startpos >= size1 + ? string2[startpos - size1] + : string1[startpos]); + + if (!fastmap[(unsigned char) TRANSLATE (c)]) + goto advance; + } + } + + /* If can't match the null string, and that's all we have left, fail. 
*/ + if (range >= 0 && startpos == total_size && fastmap + && !bufp->can_be_null) + return -1; + + val = re_match_2 (bufp, string1, size1, string2, size2, + startpos, regs, stop); + if (val >= 0) + return startpos; + + if (val == -2) + return -2; + + advance: + if (!range) + break; + else if (range > 0) + { + range--; + startpos++; + } + else + { + range++; + startpos--; + } + } + return -1; + } /* re_search_2 */ + + /* Structure for per-register (a.k.a. per-group) information. + This must not be longer than one word, because we push this value + onto the failure stack. Other register information, such as the + starting and ending positions (which are addresses), and the list of + inner groups (which is a bits list) are maintained in separate + variables. + + We are making a (strictly speaking) nonportable assumption here: that + the compiler will pack our bit fields into something that fits into + the type of `word', i.e., is something that fits into one item on the + failure stack. */ + + /* Declarations and macros for re_match_2. */ + + typedef union + { + fail_stack_elt_t word; + struct + { + /* This field is one if this group can match the empty string, + zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. 
*/ + #define MATCH_NULL_UNSET_VALUE 3 + unsigned match_null_string_p : 2; + unsigned is_active : 1; + unsigned matched_something : 1; + unsigned ever_matched_something : 1; + } bits; + } register_info_type; + + #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p) + #define IS_ACTIVE(R) ((R).bits.is_active) + #define MATCHED_SOMETHING(R) ((R).bits.matched_something) + #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something) + + static boolean group_match_null_string_p _RE_ARGS((unsigned char **p, + unsigned char *end, + register_info_type *reg_info)); + static boolean alt_match_null_string_p _RE_ARGS((unsigned char *p, + unsigned char *end, + register_info_type *reg_info)); + static boolean common_op_match_null_string_p _RE_ARGS((unsigned char **p, + unsigned char *end, + register_info_type *reg_info)); + static int bcmp_translate _RE_ARGS((const char *s1, const char *s2, + int len, char *translate)); + + /* Call this when have matched a real character; it sets `matched' flags + for the subexpressions which we are currently inside. Also records + that those subexprs have matched. */ + #define SET_REGS_MATCHED() \ + do \ + { \ + active_reg_t r; \ + for (r = lowest_active_reg; r <= highest_active_reg; r++) \ + { \ + MATCHED_SOMETHING (reg_info[r]) \ + = EVER_MATCHED_SOMETHING (reg_info[r]) \ + = 1; \ + } \ + } \ + while (0) + + + /* This converts PTR, a pointer into one of the search strings `string1' + and `string2' into an offset from the beginning of that string. */ + #define POINTER_TO_OFFSET(ptr) \ + (FIRST_STRING_P (ptr) ? (ptr) - string1 : (ptr) - string2 + size1) + + /* Registers are set to a sentinel when they haven't yet matched. */ + #define REG_UNSET_VALUE ((char *) -1) + #define REG_UNSET(e) ((e) == REG_UNSET_VALUE) + + + /* Macros for dealing with the split strings in re_match_2. */ + + #define MATCHING_IN_FIRST_STRING (dend == end_match_1) + + /* Call before fetching a character with *d. 
This switches over to + string2 if necessary. */ + #define PREFETCH() \ + while (d == dend) \ + { \ + /* End of string2 => fail. */ \ + if (dend == end_match_2) \ + goto fail; \ + /* End of string1 => advance to string2. */ \ + d = string2; \ + dend = end_match_2; \ + } + + + /* Test if at very beginning or at very end of the virtual concatenation + of `string1' and `string2'. If only one string, it's `string2'. */ + #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2) + #define AT_STRINGS_END(d) ((d) == end2) + + + /* Test if D points to a character which is word-constituent. We have + two special cases to check for: if past the end of string1, look at + the first character in string2; and if before the beginning of + string2, look at the last character in string1. */ + #define WORDCHAR_P(d) \ + (SYNTAX ((d) == end1 ? *string2 \ + : (d) == string2 - 1 ? *(end1 - 1) : *(d)) \ + == Sword) + + /* Test if the character before D and the one at D differ with respect + to being word-constituent. */ + #define AT_WORD_BOUNDARY(d) \ + (AT_STRINGS_BEG (d) || AT_STRINGS_END (d) \ + || WORDCHAR_P (d - 1) != WORDCHAR_P (d)) + + + /* Free everything we malloc. */ + #ifdef REGEX_MALLOC + #define FREE_VAR(var) if (var) free (var); var = NULL + #define FREE_VARIABLES() \ + do { \ + FREE_VAR (fail_stack.stack); \ + FREE_VAR (regstart); \ + FREE_VAR (regend); \ + FREE_VAR (old_regstart); \ + FREE_VAR (old_regend); \ + FREE_VAR (best_regstart); \ + FREE_VAR (best_regend); \ + FREE_VAR (reg_info); \ + FREE_VAR (reg_dummy); \ + FREE_VAR (reg_info_dummy); \ + } while (0) + #else /* not REGEX_MALLOC */ + /* Some MIPS systems (at least) want this to free alloca'd storage. */ + #define FREE_VARIABLES() alloca (0) + #endif /* not REGEX_MALLOC */ + + + /* These values must meet several constraints. 
They must not be valid + register values; since we have a limit of 255 registers (because + we use only one byte in the pattern for the register number), we can + use numbers larger than 255. They must differ by 1, because of + NUM_FAILURE_ITEMS above. And the value for the lowest register must + be larger than the value for the highest register, so we do not try + to actually save any registers when none are active. */ + #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH) + #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1) + + /* Matching routines. */ + + #ifndef emacs /* Emacs never uses this. */ + /* re_match is like re_match_2 except it takes only a single string. */ + + int + re_match (bufp, string, size, pos, regs) + struct re_pattern_buffer *bufp; + const char *string; + int size, pos; + struct re_registers *regs; + { + return re_match_2 (bufp, NULL, 0, string, size, pos, regs, size); + } + #endif /* not emacs */ + + + /* re_match_2 matches the compiled pattern in BUFP against the + (virtual) concatenation of STRING1 and STRING2 (of length SIZE1 + and SIZE2, respectively). We start matching at POS, and stop + matching at STOP. + + If REGS is non-null and the `no_sub' field of BUFP is zero, we + store offsets for the substring each group matched in REGS. See the + documentation for exactly how many groups we fill. + + We return -1 if no match, -2 if an internal error (such as the + failure stack overflowing). Otherwise, we return the length of the + matched substring. */ + + int + re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop) + struct re_pattern_buffer *bufp; + const char *string1, *string2; + int size1, size2; + int pos; + struct re_registers *regs; + int stop; + { + /* General temporaries. */ + int mcnt; + unsigned char *p1; + + /* Just past the end of the corresponding string. */ + const char *end1, *end2; + + /* Pointers into string1 and string2, just past the last characters in + each to consider matching. 
*/ + const char *end_match_1, *end_match_2; + + /* Where we are in the data, and the end of the current string. */ + const char *d, *dend; + + /* Where we are in the pattern, and the end of the pattern. */ + unsigned char *p = bufp->buffer; + register unsigned char *pend = p + bufp->used; + + /* We use this to map every character in the string. */ + char *translate = bufp->translate; + + /* Failure point stack. Each place that can handle a failure further + down the line pushes a failure point on this stack. It consists of + regstart, regend, and reg_info for all registers corresponding to + the subexpressions we're currently inside, plus the number of such + registers, and, finally, two char *'s. The first char * is where + to resume scanning the pattern; the second one is where to resume + scanning the strings. If the latter is zero, the failure point is + a ``dummy''; if a failure happens and the failure point is a dummy, + it gets discarded and the next one is tried. */ + fail_stack_type fail_stack; + #ifdef DEBUG + static unsigned failure_id = 0; + unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0; + #endif + + /* We fill all the registers internally, independent of what we + return, for use in backreferences. The number here includes + an element for register zero. */ + size_t num_regs = bufp->re_nsub + 1; + + /* The currently active registers. */ + active_reg_t lowest_active_reg = NO_LOWEST_ACTIVE_REG; + active_reg_t highest_active_reg = NO_HIGHEST_ACTIVE_REG; + + /* Information on the contents of registers. These are pointers into + the input strings; they record just what was matched (on this + attempt) by a subexpression part of the pattern, that is, the + regnum-th regstart pointer points to where in the pattern we began + matching and the regnum-th regend points to right after where we + stopped matching the regnum-th subexpression. (The zeroth register + keeps track of what the whole pattern matches.) 
*/ + const char **regstart = 0, **regend = 0; + + /* If a group that's operated upon by a repetition operator fails to + match anything, then the register for its start will need to be + restored because it will have been set to wherever in the string we + are when we last see its open-group operator. Similarly for a + register's end. */ + const char **old_regstart = 0, **old_regend = 0; + + /* The is_active field of reg_info helps us keep track of which (possibly + nested) subexpressions we are currently in. The matched_something + field of reg_info[reg_num] helps us tell whether or not we have + matched any of the pattern so far this time through the reg_num-th + subexpression. These two fields get reset each time through any + loop their register is in. */ + register_info_type *reg_info = 0; + + /* The following record the register info as found in the above + variables when we find a match better than any we've seen before. + This happens as we backtrack through the failure points, which in + turn happens only if we have not yet matched the entire string. */ + unsigned best_regs_set = false; + const char **best_regstart = 0, **best_regend = 0; + + /* Logically, this is `best_regend[0]'. But we don't want to have to + allocate space for that if we're not allocating space for anything + else (see below). Also, we never need info about register 0 for + any of the other register vectors, and it seems rather a kludge to + treat `best_regend' differently than the rest. So we keep track of + the end of the best match so far in a separate variable. We + initialize this to NULL so that when we backtrack the first time + and need to test it, it's not garbage. */ + const char *match_end = NULL; + + /* Used when we pop values we don't care about. */ + const char **reg_dummy = 0; + register_info_type *reg_info_dummy = 0; + + #ifdef DEBUG + /* Counts the total number of registers pushed. 
*/ + unsigned num_regs_pushed = 0; + #endif + + DEBUG_PRINT1 ("\n\nEntering re_match_2.\n"); + + INIT_FAIL_STACK (); + + /* Do not bother to initialize all the register variables if there are + no groups in the pattern, as it takes a fair amount of time. If + there are groups, we include space for register 0 (the whole + pattern), even though we never use it, since it simplifies the + array indexing. We should fix this. */ + if (bufp->re_nsub) + { + regstart = REGEX_TALLOC (num_regs, const char *); + regend = REGEX_TALLOC (num_regs, const char *); + old_regstart = REGEX_TALLOC (num_regs, const char *); + old_regend = REGEX_TALLOC (num_regs, const char *); + best_regstart = REGEX_TALLOC (num_regs, const char *); + best_regend = REGEX_TALLOC (num_regs, const char *); + reg_info = REGEX_TALLOC (num_regs, register_info_type); + reg_dummy = REGEX_TALLOC (num_regs, const char *); + reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type); + + if (!(regstart && regend && old_regstart && old_regend && reg_info + && best_regstart && best_regend && reg_dummy && reg_info_dummy)) + { + FREE_VARIABLES (); + return -2; + } + } + #ifdef REGEX_MALLOC + else + { + /* We must initialize all our variables to NULL, so that + `FREE_VARIABLES' doesn't try to free them. */ + regstart = regend = old_regstart = old_regend = best_regstart + = best_regend = reg_dummy = NULL; + reg_info = reg_info_dummy = (register_info_type *) NULL; + } + #endif /* REGEX_MALLOC */ + + /* The starting position is bogus. */ + if (pos < 0 || pos > size1 + size2) + { + FREE_VARIABLES (); + return -1; + } + + /* Initialize subexpression text positions to -1 to mark ones that no + start_memory/stop_memory has been seen for. Also initialize the + register information struct. 
*/ + for (mcnt = 1; mcnt < num_regs; mcnt++) + { + regstart[mcnt] = regend[mcnt] + = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE; + + REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE; + IS_ACTIVE (reg_info[mcnt]) = 0; + MATCHED_SOMETHING (reg_info[mcnt]) = 0; + EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0; + } + + /* We move `string1' into `string2' if the latter's empty -- but not if + `string1' is null. */ + if (size2 == 0 && string1 != NULL) + { + string2 = string1; + size2 = size1; + string1 = 0; + size1 = 0; + } + end1 = string1 + size1; + end2 = string2 + size2; + + /* Compute where to stop matching, within the two strings. */ + if (stop <= size1) + { + end_match_1 = string1 + stop; + end_match_2 = string2; + } + else + { + end_match_1 = end1; + end_match_2 = string2 + stop - size1; + } + + /* `p' scans through the pattern as `d' scans through the data. + `dend' is the end of the input string that `d' points within. `d' + is advanced into the following input string whenever necessary, but + this happens before fetching; therefore, at the beginning of the + loop, `d' can be pointing at the end of a string, but it cannot + equal `string2'. */ + if (size1 > 0 && pos <= size1) + { + d = string1 + pos; + dend = end_match_1; + } + else + { + d = string2 + pos - size1; + dend = end_match_2; + } + + DEBUG_PRINT1 ("The compiled pattern is: "); + DEBUG_PRINT_COMPILED_PATTERN (bufp, p, pend); + DEBUG_PRINT1 ("The string to match is: `"); + DEBUG_PRINT_DOUBLE_STRING (d, string1, size1, string2, size2); + DEBUG_PRINT1 ("'\n"); + + /* This loops over pattern commands. It exits by returning from the + function if the match is complete, or it drops through if the match + fails at this starting point in the input data. */ + for (;;) + { + DEBUG_PRINT2 ("\n0x%x: ", p); + + if (p == pend) + { /* End of pattern means we might have succeeded. */ + DEBUG_PRINT1 ("end of pattern ... 
"); + + /* If we haven't matched the entire string, and we want the + longest match, try backtracking. */ + if (d != end_match_2) + { + DEBUG_PRINT1 ("backtracking.\n"); + + if (!FAIL_STACK_EMPTY ()) + { /* More failure points to try. */ + boolean same_str_p = (FIRST_STRING_P (match_end) + == MATCHING_IN_FIRST_STRING); + + /* If exceeds best match so far, save it. */ + if (!best_regs_set + || (same_str_p && d > match_end) + || (!same_str_p && !MATCHING_IN_FIRST_STRING)) + { + best_regs_set = true; + match_end = d; + + DEBUG_PRINT1 ("\nSAVING match as best so far.\n"); + + for (mcnt = 1; mcnt < num_regs; mcnt++) + { + best_regstart[mcnt] = regstart[mcnt]; + best_regend[mcnt] = regend[mcnt]; + } + } + goto fail; + } + + /* If no failure points, don't restore garbage. */ + else if (best_regs_set) + { + restore_best_regs: + /* Restore best match. It may happen that `dend == + end_match_1' while the restored d is in string2. + For example, the pattern `x.*y.*z' against the + strings `x-' and `y-z-', if the two strings are + not consecutive in memory. */ + DEBUG_PRINT1 ("Restoring best registers.\n"); + + d = match_end; + dend = ((d >= string1 && d <= end1) + ? end_match_1 : end_match_2); + + for (mcnt = 1; mcnt < num_regs; mcnt++) + { + regstart[mcnt] = best_regstart[mcnt]; + regend[mcnt] = best_regend[mcnt]; + } + } + } /* d != end_match_2 */ + + DEBUG_PRINT1 ("Accepting match.\n"); + + /* If caller wants register contents data back, do it. */ + if (regs && !bufp->no_sub) + { + /* Have the register data arrays been allocated? */ + if (bufp->regs_allocated == REGS_UNALLOCATED) + { /* No. So allocate them with malloc. We need one + extra element beyond `num_regs' for the `-1' marker + GNU code uses. 
*/ + regs->num_regs = MAX (RE_NREGS, num_regs + 1); + regs->start = TALLOC (regs->num_regs, regoff_t); + regs->end = TALLOC (regs->num_regs, regoff_t); + if (regs->start == NULL || regs->end == NULL) + return -2; + bufp->regs_allocated = REGS_REALLOCATE; + } + else if (bufp->regs_allocated == REGS_REALLOCATE) + { /* Yes. If we need more elements than were already + allocated, reallocate them. If we need fewer, just + leave it alone. */ + if (regs->num_regs < num_regs + 1) + { + regs->num_regs = num_regs + 1; + RETALLOC (regs->start, regs->num_regs, regoff_t); + RETALLOC (regs->end, regs->num_regs, regoff_t); + if (regs->start == NULL || regs->end == NULL) + return -2; + } + } + else + { + /* These braces fend off a "empty body in an else-statement" + warning under GCC when assert expands to nothing. */ + assert (bufp->regs_allocated == REGS_FIXED); + } + + /* Convert the pointer data in `regstart' and `regend' to + indices. Register zero has to be set differently, + since we haven't kept track of any info for it. */ + if (regs->num_regs > 0) + { + regs->start[0] = pos; + regs->end[0] = (MATCHING_IN_FIRST_STRING ? d - string1 + : d - string2 + size1); + } + + /* Go through the first `min (num_regs, regs->num_regs)' + registers, since that is all we initialized. */ + for (mcnt = 1; mcnt < MIN (num_regs, regs->num_regs); mcnt++) + { + if (REG_UNSET (regstart[mcnt]) || REG_UNSET (regend[mcnt])) + regs->start[mcnt] = regs->end[mcnt] = -1; + else + { + regs->start[mcnt] = POINTER_TO_OFFSET (regstart[mcnt]); + regs->end[mcnt] = POINTER_TO_OFFSET (regend[mcnt]); + } + } + + /* If the regs structure we return has more elements than + were in the pattern, set the extra elements to -1. If + we (re)allocated the registers, this is the case, + because we always allocate enough to have at least one + -1 at the end. 
*/ + for (mcnt = num_regs; mcnt < regs->num_regs; mcnt++) + regs->start[mcnt] = regs->end[mcnt] = -1; + } /* regs && !bufp->no_sub */ + + FREE_VARIABLES (); + DEBUG_PRINT4 ("%u failure points pushed, %u popped (%u remain).\n", + nfailure_points_pushed, nfailure_points_popped, + nfailure_points_pushed - nfailure_points_popped); + DEBUG_PRINT2 ("%u registers pushed.\n", num_regs_pushed); + + mcnt = d - pos - (MATCHING_IN_FIRST_STRING + ? string1 + : string2 - size1); + + DEBUG_PRINT2 ("Returning %d from re_match_2.\n", mcnt); + + return mcnt; + } + + /* Otherwise match next pattern command. */ + #ifdef SWITCH_ENUM_BUG + switch ((int) ((re_opcode_t) *p++)) + #else + switch ((re_opcode_t) *p++) + #endif + { + /* Ignore these. Used to ignore the n of succeed_n's which + currently have n == 0. */ + case no_op: + DEBUG_PRINT1 ("EXECUTING no_op.\n"); + break; + + + /* Match the next n pattern characters exactly. The following + byte in the pattern defines n, and the n bytes after that + are the characters to match. */ + case exactn: + mcnt = *p++; + DEBUG_PRINT2 ("EXECUTING exactn %d.\n", mcnt); + + /* This is written out as an if-else so we don't waste time + testing `translate' inside the loop. */ + if (translate) + { + do + { + PREFETCH (); + if (translate[(unsigned char) *d++] != (char) *p++) + goto fail; + } + while (--mcnt); + } + else + { + do + { + PREFETCH (); + if (*d++ != (char) *p++) goto fail; + } + while (--mcnt); + } + SET_REGS_MATCHED (); + break; + + + /* Match any character except possibly a newline or a null. 
*/ + case anychar: + DEBUG_PRINT1 ("EXECUTING anychar.\n"); + + PREFETCH (); + + if ((!(bufp->syntax & RE_DOT_NEWLINE) && TRANSLATE (*d) == '\n') + || (bufp->syntax & RE_DOT_NOT_NULL && TRANSLATE (*d) == '\000')) + goto fail; + + SET_REGS_MATCHED (); + DEBUG_PRINT2 (" Matched `%d'.\n", *d); + d++; + break; + + + case charset: + case charset_not: + { + register unsigned char c; + boolean not = (re_opcode_t) *(p - 1) == charset_not; + + DEBUG_PRINT2 ("EXECUTING charset%s.\n", not ? "_not" : ""); + + PREFETCH (); + c = TRANSLATE (*d); /* The character to match. */ + + /* Cast to `unsigned' instead of `unsigned char' in case the + bit list is a full 32 bytes long. */ + if (c < (unsigned) (*p * BYTEWIDTH) + && p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) + not = !not; + + p += 1 + *p; + + if (!not) goto fail; + + SET_REGS_MATCHED (); + d++; + break; + } + + + /* The beginning of a group is represented by start_memory. + The arguments are the register number in the next byte, and the + number of groups inner to this one in the next. The text + matched within the group is recorded (in the internal + registers data structure) under the register number. */ + case start_memory: + DEBUG_PRINT3 ("EXECUTING start_memory %d (%d):\n", *p, p[1]); + + /* Find out if this group can match the empty string. */ + p1 = p; /* To send to group_match_null_string_p. */ + + if (REG_MATCH_NULL_STRING_P (reg_info[*p]) == MATCH_NULL_UNSET_VALUE) + REG_MATCH_NULL_STRING_P (reg_info[*p]) + = group_match_null_string_p (&p1, pend, reg_info); + + /* Save the position in the string where we were the last time + we were at this open-group operator in case the group is + operated upon by a repetition operator, e.g., with `(a*)*b' + against `ab'; then we want to ignore where we are now in + the string in case this attempt to match fails. */ + old_regstart[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) + ? REG_UNSET (regstart[*p]) ? 
d : regstart[*p] + : regstart[*p]; + DEBUG_PRINT2 (" old_regstart: %d\n", + POINTER_TO_OFFSET (old_regstart[*p])); + + regstart[*p] = d; + DEBUG_PRINT2 (" regstart: %d\n", POINTER_TO_OFFSET (regstart[*p])); + + IS_ACTIVE (reg_info[*p]) = 1; + MATCHED_SOMETHING (reg_info[*p]) = 0; + + /* This is the new highest active register. */ + highest_active_reg = *p; + + /* If nothing was active before, this is the new lowest active + register. */ + if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) + lowest_active_reg = *p; + + /* Move past the register number and inner group count. */ + p += 2; + break; + + + /* The stop_memory opcode represents the end of a group. Its + arguments are the same as start_memory's: the register + number, and the number of inner groups. */ + case stop_memory: + DEBUG_PRINT3 ("EXECUTING stop_memory %d (%d):\n", *p, p[1]); + + /* We need to save the string position the last time we were at + this close-group operator in case the group is operated + upon by a repetition operator, e.g., with `((a*)*(b*)*)*' + against `aba'; then we want to ignore where we are now in + the string in case this attempt to match fails. */ + old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p]) + ? REG_UNSET (regend[*p]) ? d : regend[*p] + : regend[*p]; + DEBUG_PRINT2 (" old_regend: %d\n", + POINTER_TO_OFFSET (old_regend[*p])); + + regend[*p] = d; + DEBUG_PRINT2 (" regend: %d\n", POINTER_TO_OFFSET (regend[*p])); + + /* This register isn't active anymore. */ + IS_ACTIVE (reg_info[*p]) = 0; + + /* If this was the only register active, nothing is active + anymore. */ + if (lowest_active_reg == highest_active_reg) + { + lowest_active_reg = NO_LOWEST_ACTIVE_REG; + highest_active_reg = NO_HIGHEST_ACTIVE_REG; + } + else + { /* We must scan for the new highest active register, since + it isn't necessarily one less than now: consider + (a(b)c(d(e)f)g). When group 3 ends, after the f), the + new highest active register is 1. 
*/ + unsigned char r = *p - 1; + while (r > 0 && !IS_ACTIVE (reg_info[r])) + r--; + + /* If we end up at register zero, that means that we saved + the registers as the result of an `on_failure_jump', not + a `start_memory', and we jumped to past the innermost + `stop_memory'. For example, in ((.)*) we save + registers 1 and 2 as a result of the *, but when we pop + back to the second ), we are at the stop_memory 1. + Thus, nothing is active. */ + if (r == 0) + { + lowest_active_reg = NO_LOWEST_ACTIVE_REG; + highest_active_reg = NO_HIGHEST_ACTIVE_REG; + } + else + highest_active_reg = r; + } + + /* If just failed to match something this time around with a + group that's operated on by a repetition operator, try to + force exit from the ``loop'', and restore the register + information for this group that we had before trying this + last match. */ + if ((!MATCHED_SOMETHING (reg_info[*p]) + || (re_opcode_t) p[-3] == start_memory) + && (p + 2) < pend) + { + boolean is_a_jump_n = false; + + p1 = p + 2; + mcnt = 0; + switch ((re_opcode_t) *p1++) + { + case jump_n: + is_a_jump_n = true; + case pop_failure_jump: + case maybe_pop_jump: + case jump: + case dummy_failure_jump: + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + if (is_a_jump_n) + p1 += 2; + break; + + default: + /* do nothing */ ; + } + p1 += mcnt; + + /* If the next operation is a jump backwards in the pattern + to an on_failure_jump right before the start_memory + corresponding to this stop_memory, exit from the loop + by forcing a failure after pushing on the stack the + on_failure_jump's jump in the pattern, and d. */ + if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump + && (re_opcode_t) p1[3] == start_memory && p1[4] == *p) + { + /* If this group ever matched anything, then restore + what its registers were before trying this last + failed match, e.g., with `(a*)*b' against `ab' for + regstart[1], and, e.g., with `((a*)*(b*)*)*' + against `aba' for regend[3]. 
+ + Also restore the registers for inner groups for, + e.g., `((a*)(b*))*' against `aba' (register 3 would + otherwise get trashed). */ + + if (EVER_MATCHED_SOMETHING (reg_info[*p])) + { + unsigned r; + + EVER_MATCHED_SOMETHING (reg_info[*p]) = 0; + + /* Restore this and inner groups' (if any) registers. */ + for (r = *p; r < *p + *(p + 1); r++) + { + regstart[r] = old_regstart[r]; + + /* xx why this test? */ + if ((s_reg_t) old_regend[r] >= (s_reg_t) regstart[r]) + regend[r] = old_regend[r]; + } + } + p1++; + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + PUSH_FAILURE_POINT (p1 + mcnt, d, -2); + PUSH_FAILURE_POINT2(p1 + mcnt, d, -2); + + goto fail; + } + } + + /* Move past the register number and the inner group count. */ + p += 2; + break; + + + /* \ has been turned into a `duplicate' command which is + followed by the numeric value of as the register number. */ + case duplicate: + { + register const char *d2, *dend2; + int regno = *p++; /* Get which register to match against. */ + DEBUG_PRINT2 ("EXECUTING duplicate %d.\n", regno); + + /* Can't back reference a group which we've never matched. */ + if (REG_UNSET (regstart[regno]) || REG_UNSET (regend[regno])) + goto fail; + + /* Where in input to try to start matching. */ + d2 = regstart[regno]; + + /* Where to stop matching; if both the place to start and + the place to stop matching are in the same string, then + set to the place to stop, otherwise, for now have to use + the end of the first string. */ + + dend2 = ((FIRST_STRING_P (regstart[regno]) + == FIRST_STRING_P (regend[regno])) + ? regend[regno] : end_match_1); + for (;;) + { + /* If necessary, advance to next segment in register + contents. */ + while (d2 == dend2) + { + if (dend2 == end_match_2) break; + if (dend2 == regend[regno]) break; + + /* End of string1 => advance to string2. */ + d2 = string2; + dend2 = regend[regno]; + } + /* At end of register contents => success */ + if (d2 == dend2) break; + + /* If necessary, advance to next segment in data. 
*/ + PREFETCH (); + + /* How many characters left in this segment to match. */ + mcnt = dend - d; + + /* Want how many consecutive characters we can match in + one shot, so, if necessary, adjust the count. */ + if (mcnt > dend2 - d2) + mcnt = dend2 - d2; + + /* Compare that many; failure if mismatch, else move + past them. */ + if (translate + ? bcmp_translate (d, d2, mcnt, translate) + : bcmp (d, d2, mcnt)) + goto fail; + d += mcnt, d2 += mcnt; + } + } + break; + + + /* begline matches the empty string at the beginning of the string + (unless `not_bol' is set in `bufp'), and, if + `newline_anchor' is set, after newlines. */ + case begline: + DEBUG_PRINT1 ("EXECUTING begline.\n"); + + if (AT_STRINGS_BEG (d)) + { + if (!bufp->not_bol) break; + } + else if (d[-1] == '\n' && bufp->newline_anchor) + { + break; + } + /* In all other cases, we fail. */ + goto fail; + + + /* endline is the dual of begline. */ + case endline: + DEBUG_PRINT1 ("EXECUTING endline.\n"); + + if (AT_STRINGS_END (d)) + { + if (!bufp->not_eol) break; + } + + /* We have to ``prefetch'' the next character. */ + else if ((d == end1 ? *string2 : *d) == '\n' + && bufp->newline_anchor) + { + break; + } + goto fail; + + + /* Match at the very beginning of the data. */ + case begbuf: + DEBUG_PRINT1 ("EXECUTING begbuf.\n"); + if (AT_STRINGS_BEG (d)) + break; + goto fail; + + + /* Match at the very end of the data. */ + case endbuf: + DEBUG_PRINT1 ("EXECUTING endbuf.\n"); + if (AT_STRINGS_END (d)) + break; + goto fail; + + + /* on_failure_keep_string_jump is used to optimize `.*\n'. It + pushes NULL as the value for the string on the stack. Then + `pop_failure_point' will keep the current value for the + string, instead of restoring it. To see why, consider + matching `foo\nbar' against `.*\n'. The .* matches the foo; + then the . fails against the \n. But the next thing we want + to do is match the \n against the \n; if we restored the + string value, we would be back at the foo. 
+ + Because this is used only in specific cases, we don't need to + check all the things that `on_failure_jump' does, to make + sure the right things get saved on the stack. Hence we don't + share its code. The only reason to push anything on the + stack at all is that otherwise we would have to change + `anychar's code to do something besides goto fail in this + case; that seems worse than this. */ + case on_failure_keep_string_jump: + DEBUG_PRINT1 ("EXECUTING on_failure_keep_string_jump"); + + EXTRACT_NUMBER_AND_INCR (mcnt, p); + DEBUG_PRINT3 (" %d (to 0x%x):\n", mcnt, p + mcnt); + + PUSH_FAILURE_POINT (p + mcnt, NULL, -2); + PUSH_FAILURE_POINT2(p + mcnt, NULL, -2); + break; + + + /* Uses of on_failure_jump: + + Each alternative starts with an on_failure_jump that points + to the beginning of the next alternative. Each alternative + except the last ends with a jump that in effect jumps past + the rest of the alternatives. (They really jump to the + ending jump of the following alternative, because tensioning + these jumps is a hassle.) + + Repeats start with an on_failure_jump that points past both + the repetition text and either the following jump or + pop_failure_jump back to this on_failure_jump. */ + case on_failure_jump: + on_failure: + DEBUG_PRINT1 ("EXECUTING on_failure_jump"); + + EXTRACT_NUMBER_AND_INCR (mcnt, p); + DEBUG_PRINT3 (" %d (to 0x%x)", mcnt, p + mcnt); + + /* If this on_failure_jump comes right before a group (i.e., + the original * applied to a group), save the information + for that group and all inner ones, so that if we fail back + to this point, the group's information will be correct. + For example, in \(a*\)*\1, we need the preceding group, + and in \(\(a*\)b*\)\2, we need the inner group. */ + + /* We can't use `p' to check ahead because we push + a failure point to `p + mcnt' after we do this. 
*/ + p1 = p; + + /* We need to skip no_op's before we look for the + start_memory in case this on_failure_jump is happening as + the result of a completed succeed_n, as in \(a\)\{1,3\}b\1 + against aba. */ + while (p1 < pend && (re_opcode_t) *p1 == no_op) + p1++; + + if (p1 < pend && (re_opcode_t) *p1 == start_memory) + { + /* We have a new highest active register now. This will + get reset at the start_memory we are about to get to, + but we will have saved all the registers relevant to + this repetition op, as described above. */ + highest_active_reg = *(p1 + 1) + *(p1 + 2); + if (lowest_active_reg == NO_LOWEST_ACTIVE_REG) + lowest_active_reg = *(p1 + 1); + } + + DEBUG_PRINT1 (":\n"); + PUSH_FAILURE_POINT (p + mcnt, d, -2); + PUSH_FAILURE_POINT2(p + mcnt, d, -2); + break; + + + /* A smart repeat ends with `maybe_pop_jump'. + We change it to either `pop_failure_jump' or `jump'. */ + case maybe_pop_jump: + EXTRACT_NUMBER_AND_INCR (mcnt, p); + DEBUG_PRINT2 ("EXECUTING maybe_pop_jump %d.\n", mcnt); + { + register unsigned char *p2 = p; + + /* Compare the beginning of the repeat with what in the + pattern follows its end. If we can establish that there + is nothing that they would both match, i.e., that we + would have to backtrack because of (as in, e.g., `a*a') + then we can change to pop_failure_jump, because we'll + never have to backtrack. + + This is not true in the case of alternatives: in + `(a|ab)*' we do need to backtrack to the `ab' alternative + (e.g., if the string was `ab'). But instead of trying to + detect that here, the alternative has put on a dummy + failure point which is what we will end up popping. */ + + /* Skip over open/close-group commands. */ + while (p2 + 2 < pend + && ((re_opcode_t) *p2 == stop_memory + || (re_opcode_t) *p2 == start_memory)) + p2 += 3; /* Skip over args, too. */ + + /* If we're at the end of the pattern, we can change. */ + if (p2 == pend) + { + /* Consider what happens when matching ":\(.*\)" + against ":/". 
I don't really understand this code + yet. */ + p[-3] = (unsigned char) pop_failure_jump; + DEBUG_PRINT1 + (" End of pattern: change to `pop_failure_jump'.\n"); + } + + else if ((re_opcode_t) *p2 == exactn + || (bufp->newline_anchor && (re_opcode_t) *p2 == endline)) + { + register unsigned char c + = *p2 == (unsigned char) endline ? '\n' : p2[2]; + p1 = p + mcnt; + + /* p1[0] ... p1[2] are the `on_failure_jump' corresponding + to the `maybe_finalize_jump' of this case. Examine what + follows. */ + if ((re_opcode_t) p1[3] == exactn && p1[5] != c) + { + p[-3] = (unsigned char) pop_failure_jump; + DEBUG_PRINT3 (" %c != %c => pop_failure_jump.\n", + c, p1[5]); + } + + else if ((re_opcode_t) p1[3] == charset + || (re_opcode_t) p1[3] == charset_not) + { + int not = (re_opcode_t) p1[3] == charset_not; + + if (c < (unsigned char) (p1[4] * BYTEWIDTH) + && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))) + not = !not; + + /* `not' is equal to 1 if c would match, which means + that we can't change to pop_failure_jump. */ + if (!not) + { + p[-3] = (unsigned char) pop_failure_jump; + DEBUG_PRINT1 (" No match => pop_failure_jump.\n"); + } + } + } + } + p -= 2; /* Point at relative address again. */ + if ((re_opcode_t) p[-1] != pop_failure_jump) + { + p[-1] = (unsigned char) jump; + DEBUG_PRINT1 (" Match => jump.\n"); + goto unconditional_jump; + } + /* Note fall through. */ + + + /* The end of a simple repeat has a pop_failure_jump back to + its matching on_failure_jump, where the latter will push a + failure point. The pop_failure_jump takes off failure + points put on by this pop_failure_jump's matching + on_failure_jump; we got through the pattern to here from the + matching on_failure_jump, so didn't fail. */ + case pop_failure_jump: + { + /* We need to pass separate storage for the lowest and + highest registers, even though we don't care about the + actual values. 
Otherwise, we will restore only one + register from the stack, since lowest will == highest in + `pop_failure_point'. */ + active_reg_t dummy_low_reg, dummy_high_reg; + unsigned char *pdummy; + const char *sdummy; + + DEBUG_PRINT1 ("EXECUTING pop_failure_jump.\n"); + POP_FAILURE_POINT (sdummy, pdummy, + dummy_low_reg, dummy_high_reg, + reg_dummy, reg_dummy, reg_info_dummy); + } + /* Note fall through. */ + + + /* Unconditionally jump (without popping any failure points). */ + case jump: + unconditional_jump: + EXTRACT_NUMBER_AND_INCR (mcnt, p); /* Get the amount to jump. */ + DEBUG_PRINT2 ("EXECUTING jump %d ", mcnt); + p += mcnt; /* Do the jump. */ + DEBUG_PRINT2 ("(to 0x%x).\n", p); + break; + + + /* We need this opcode so we can detect where alternatives end + in `group_match_null_string_p' et al. */ + case jump_past_alt: + DEBUG_PRINT1 ("EXECUTING jump_past_alt.\n"); + goto unconditional_jump; + + + /* Normally, the on_failure_jump pushes a failure point, which + then gets popped at pop_failure_jump. We will end up at + pop_failure_jump, also, and with a pattern of, say, `a+', we + are skipping over the on_failure_jump, so we have to push + something meaningless for pop_failure_jump to pop. */ + case dummy_failure_jump: + DEBUG_PRINT1 ("EXECUTING dummy_failure_jump.\n"); + /* It doesn't matter what we push for the string here. What + the code at `fail' tests is the value for the pattern. */ + PUSH_FAILURE_POINT (0, 0, -2); + PUSH_FAILURE_POINT2(0, 0, -2); + goto unconditional_jump; + + + /* At the end of an alternative, we need to push a dummy failure + point in case we are followed by a `pop_failure_jump', because + we don't want the failure point for the alternative to be + popped. For example, matching `(a|ab)*' against `aab' + requires that we match the `ab' alternative. */ + case push_dummy_failure: + DEBUG_PRINT1 ("EXECUTING push_dummy_failure.\n"); + /* See comments just above at `dummy_failure_jump' about the + two zeroes. 
*/ + PUSH_FAILURE_POINT (0, 0, -2); + PUSH_FAILURE_POINT2(0, 0, -2); + break; + + /* Have to succeed matching what follows at least n times. + After that, handle like `on_failure_jump'. */ + case succeed_n: + EXTRACT_NUMBER (mcnt, p + 2); + DEBUG_PRINT2 ("EXECUTING succeed_n %d.\n", mcnt); + + assert (mcnt >= 0); + /* Originally, this is how many times we HAVE to succeed. */ + if (mcnt > 0) + { + mcnt--; + p += 2; + STORE_NUMBER_AND_INCR (p, mcnt); + DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p, mcnt); + } + else if (mcnt == 0) + { + DEBUG_PRINT2 (" Setting two bytes from 0x%x to no_op.\n", p+2); + p[2] = (unsigned char) no_op; + p[3] = (unsigned char) no_op; + goto on_failure; + } + break; + + case jump_n: + EXTRACT_NUMBER (mcnt, p + 2); + DEBUG_PRINT2 ("EXECUTING jump_n %d.\n", mcnt); + + /* Originally, this is how many times we CAN jump. */ + if (mcnt) + { + mcnt--; + STORE_NUMBER (p + 2, mcnt); + goto unconditional_jump; + } + /* If don't have to jump any more, skip over the rest of command. 
*/ + else + p += 4; + break; + + case set_number_at: + { + DEBUG_PRINT1 ("EXECUTING set_number_at.\n"); + + EXTRACT_NUMBER_AND_INCR (mcnt, p); + p1 = p + mcnt; + EXTRACT_NUMBER_AND_INCR (mcnt, p); + DEBUG_PRINT3 (" Setting 0x%x to %d.\n", p1, mcnt); + STORE_NUMBER (p1, mcnt); + break; + } + + case wordbound: + DEBUG_PRINT1 ("EXECUTING wordbound.\n"); + if (AT_WORD_BOUNDARY (d)) + break; + goto fail; + + case notwordbound: + DEBUG_PRINT1 ("EXECUTING notwordbound.\n"); + if (AT_WORD_BOUNDARY (d)) + goto fail; + break; + + case wordbeg: + DEBUG_PRINT1 ("EXECUTING wordbeg.\n"); + if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1))) + break; + goto fail; + + case wordend: + DEBUG_PRINT1 ("EXECUTING wordend.\n"); + if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1) + && (!WORDCHAR_P (d) || AT_STRINGS_END (d))) + break; + goto fail; + + #ifdef emacs + #ifdef emacs19 + case before_dot: + DEBUG_PRINT1 ("EXECUTING before_dot.\n"); + if (PTR_CHAR_POS ((unsigned char *) d) >= point) + goto fail; + break; + + case at_dot: + DEBUG_PRINT1 ("EXECUTING at_dot.\n"); + if (PTR_CHAR_POS ((unsigned char *) d) != point) + goto fail; + break; + + case after_dot: + DEBUG_PRINT1 ("EXECUTING after_dot.\n"); + if (PTR_CHAR_POS ((unsigned char *) d) <= point) + goto fail; + break; + #else /* not emacs19 */ + case at_dot: + DEBUG_PRINT1 ("EXECUTING at_dot.\n"); + if (PTR_CHAR_POS ((unsigned char *) d) + 1 != point) + goto fail; + break; + #endif /* not emacs19 */ + + case syntaxspec: + DEBUG_PRINT2 ("EXECUTING syntaxspec %d.\n", mcnt); + mcnt = *p++; + goto matchsyntax; + + case wordchar: + DEBUG_PRINT1 ("EXECUTING Emacs wordchar.\n"); + mcnt = (int) Sword; + matchsyntax: + PREFETCH (); + if (SYNTAX (*d++) != (enum syntaxcode) mcnt) + goto fail; + SET_REGS_MATCHED (); + break; + + case notsyntaxspec: + DEBUG_PRINT2 ("EXECUTING notsyntaxspec %d.\n", mcnt); + mcnt = *p++; + goto matchnotsyntax; + + case notwordchar: + DEBUG_PRINT1 ("EXECUTING Emacs notwordchar.\n"); + mcnt = (int) 
Sword; + matchnotsyntax: + PREFETCH (); + if (SYNTAX (*d++) == (enum syntaxcode) mcnt) + goto fail; + SET_REGS_MATCHED (); + break; + + #else /* not emacs */ + case wordchar: + DEBUG_PRINT1 ("EXECUTING non-Emacs wordchar.\n"); + PREFETCH (); + if (!WORDCHAR_P (d)) + goto fail; + SET_REGS_MATCHED (); + d++; + break; + + case notwordchar: + DEBUG_PRINT1 ("EXECUTING non-Emacs notwordchar.\n"); + PREFETCH (); + if (WORDCHAR_P (d)) + goto fail; + SET_REGS_MATCHED (); + d++; + break; + #endif /* not emacs */ + + default: + abort (); + } + continue; /* Successfully executed one pattern command; keep going. */ + + + /* We goto here if a matching operation fails. */ + fail: + if (!FAIL_STACK_EMPTY ()) + { /* A restart point is known. Restore to that state. */ + DEBUG_PRINT1 ("\nFAIL:\n"); + POP_FAILURE_POINT (d, p, + lowest_active_reg, highest_active_reg, + regstart, regend, reg_info); + + /* If this failure point is a dummy, try the next one. */ + if (!p) + goto fail; + + /* If we failed to the end of the pattern, don't examine *p. */ + assert (p <= pend); + if (p < pend) + { + boolean is_a_jump_n = false; + + /* If failed to a backwards jump that's part of a repetition + loop, need to pop this failure point and use the next one. */ + switch ((re_opcode_t) *p) + { + case jump_n: + is_a_jump_n = true; + case maybe_pop_jump: + case pop_failure_jump: + case jump: + p1 = p + 1; + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + p1 += mcnt; + + if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n) + || (!is_a_jump_n + && (re_opcode_t) *p1 == on_failure_jump)) + goto fail; + break; + default: + /* do nothing */ ; + } + } + + if (d >= string1 && d <= end1) + dend = end_match_1; + } + else + break; /* Matching at this starting point really fails. */ + } /* for (;;) */ + + if (best_regs_set) + goto restore_best_regs; + + FREE_VARIABLES (); + + return -1; /* Failure to match. */ + } /* re_match_2 */ + + /* Subroutine definitions for re_match_2. 
*/ + + + /* We are passed P, the address of a pointer to a register number after a start_memory; END bounds the scan and REG_INFO is handed on to the helpers. + + Return true if the pattern up to the corresponding stop_memory can + match the empty string, and false otherwise. + + If we find the matching stop_memory, sets *P to point to one past its number. + On every path that returns false, *P is left as it was on entry. + + We don't handle duplicates properly (yet). */ + + static boolean + group_match_null_string_p (p, end, reg_info) + unsigned char **p, *end; + register_info_type *reg_info; + { + int mcnt; + /* *P is at the register number, so adding 2 points to after the args to the start_memory. */ + unsigned char *p1 = *p + 2; + + while (p1 < end) + { + /* Skip over opcodes that can match nothing, and return true or + false, as appropriate, when we get to one that can't, or to the + matching stop_memory. */ + + switch ((re_opcode_t) *p1) + { + /* Could be either a loop or a series of alternatives. */ + case on_failure_jump: + p1++; + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + + /* Only a jump that is not backwards in the pattern needs the work below; + for a backwards one P1 is already past the offset and the scan just goes on. */ + + if (mcnt >= 0) + { + /* Go through the on_failure_jumps of the alternatives, + seeing if any of the alternatives cannot match nothing. + The last alternative starts with only a jump, + whereas the rest start with on_failure_jump and end + with a jump, e.g., here is the pattern for `a|b|c': + + /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6 + /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3 + /exactn/1/c + + So, we have to first go through the first (n-1) + alternatives and then deal with the last one separately. */ + + + /* Deal with the first (n-1) alternatives, which start + with an on_failure_jump (see above) that jumps to right + past a jump_past_alt. */ + + while ((re_opcode_t) p1[mcnt-3] == jump_past_alt) + { + /* `mcnt' holds how many bytes long the alternative + is, including the ending `jump_past_alt' and + its number. 
*/ + + if (!alt_match_null_string_p (p1, p1 + mcnt - 3, + reg_info)) + return false; + + /* Move to right after this alternative, including the + jump_past_alt. */ + p1 += mcnt; + + /* Break if it's the beginning of an n-th alternative + that doesn't begin with an on_failure_jump. */ + if ((re_opcode_t) *p1 != on_failure_jump) + break; + + /* Still have to check that it's not an n-th + alternative that starts with an on_failure_jump. */ + p1++; + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + if ((re_opcode_t) p1[mcnt-3] != jump_past_alt) + { + /* Get to the beginning of the n-th alternative: undo the opcode byte and the two offset bytes just read. */ + p1 -= 3; + break; + } + } + + /* Deal with the last alternative: go back and get number + of the `jump_past_alt' just before it. `mcnt' contains + the length of the alternative. */ + EXTRACT_NUMBER (mcnt, p1 - 2); + + if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info)) + return false; + + p1 += mcnt; /* Get past the n-th alternative. */ + } /* if mcnt >= 0 */ + break; + + + case stop_memory: /* P1 is still on the opcode itself, so p1[1] is its register number. NOTE(review): re_match_2 steps over two argument bytes after a stop_memory opcode, so p1 + 2 is the second of them rather than the next opcode -- confirm this is what callers expect. */ + assert (p1[1] == **p); + *p = p1 + 2; + return true; + + + default: + if (!common_op_match_null_string_p (&p1, end, reg_info)) + return false; + } + } /* while p1 < end */ + /* Reached END without finding the matching stop_memory. */ + return false; + } /* group_match_null_string_p */ + + + /* Similar to group_match_null_string_p, but doesn't deal with alternatives: + It expects P to be the first byte of a single alternative and END one + byte past the last. The alternative can contain groups. Returns true if the scan gets to END without meeting an opcode that cannot match the empty string, false otherwise; P is passed by value, so the caller's pointer is not moved. */ + + static boolean + alt_match_null_string_p (p, end, reg_info) + unsigned char *p, *end; + register_info_type *reg_info; + { + int mcnt; + unsigned char *p1 = p; + + while (p1 < end) + { + /* Skip over opcodes that can match nothing, and return false when we get + to one that can't. */ + + switch ((re_opcode_t) *p1) + { + /* It's a loop: step over the opcode and its offset, then carry on from the jump target. 
*/ + case on_failure_jump: + p1++; + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + p1 += mcnt; + break; + + default: + if (!common_op_match_null_string_p (&p1, end, reg_info)) + return false; + } + } /* while p1 < end */ + + return true; + } /* alt_match_null_string_p */ + + + /* Deals with the ops common to group_match_null_string_p and + alt_match_null_string_p. P is the address of a pointer to the opcode to examine. + + When it returns true, sets *P to one after the op and its arguments, if any (but see the notes on individual cases); when it returns false, *P is not modified. */ + + static boolean + common_op_match_null_string_p (p, end, reg_info) + unsigned char **p, *end; + register_info_type *reg_info; + { + int mcnt; + boolean ret; + int reg_no; + unsigned char *p1 = *p; + + switch ((re_opcode_t) *p1++) + { + case no_op: + case begline: + case endline: + case begbuf: + case endbuf: + case wordbeg: + case wordend: + case wordbound: + case notwordbound: + #ifdef emacs + case before_dot: + case at_dot: + case after_dot: + #endif + break; /* No operand bytes to skip for any of these. */ + + case start_memory: + reg_no = *p1; /* P1 is now on the register number, which is what group_match_null_string_p expects. */ + assert (reg_no > 0 && reg_no <= MAX_REGNUM); + ret = group_match_null_string_p (&p1, end, reg_info); + + /* Have to set this here in case we're checking a group which + contains a group and a back reference to it. Only a still-unset entry is written. */ + + if (REG_MATCH_NULL_STRING_P (reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE) + REG_MATCH_NULL_STRING_P (reg_info[reg_no]) = ret; + + if (!ret) + return false; + break; + + /* If this is an optimized succeed_n for zero times, make the jump. A backwards jump is reported as unable to match the empty string. */ + case jump: + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + if (mcnt >= 0) + p1 += mcnt; + else + return false; + break; + + case succeed_n: + /* Get to the number of times to succeed, which follows the two-byte jump offset. */ + p1 += 2; + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + + if (mcnt == 0) + { /* No successes required: go back to the jump offset and take the jump. */ + p1 -= 4; + EXTRACT_NUMBER_AND_INCR (mcnt, p1); + p1 += mcnt; + } + else + return false; + break; + + case duplicate: /* NOTE(review): P1 is left on the register-number operand when this breaks, so *P ends up there too -- confirm this is intended. */ + if (!REG_MATCH_NULL_STRING_P (reg_info[*p1])) + return false; + break; + + case set_number_at: + p1 += 4; + /* Note fall through: as written, set_number_at returns false below and the adjustment to P1 is never stored. NOTE(review): a `break' may have been intended -- confirm. */ + default: + /* All other opcodes mean we cannot match the empty string. 
*/ + return false; + } + + *p = p1; + return true; + } /* common_op_match_null_string_p */ + + + /* Compare S1 and S2 byte by byte after mapping each byte through TRANSLATE. Return zero if the translated bytes are identical for LEN + bytes; nonzero (1) at the first difference. Bytes are read as unsigned char before being used as an index. */ + + static int + bcmp_translate (s1, s2, len, translate) + const char *s1, *s2; + register int len; + char *translate; + { + register const unsigned char *p1 = (const unsigned char *) s1, + *p2 = (const unsigned char *) s2; + while (len) + { + if (translate[*p1++] != translate[*p2++]) return 1; + len--; + } + return 0; + } + + /* Entry points for GNU code. */ + + /* re_compile_pattern is the GNU regular expression compiler: it + compiles PATTERN (of length LENGTH) and puts the result in BUFP. + Returns 0 if the pattern was valid, otherwise an error string (the re_error_msg entry for regex_compile's result code). + + Assumes the `allocated' (and perhaps `buffer') and `translate' fields + are set in BUFP on entry. The `regs_allocated', `no_sub' and `newline_anchor' fields are overwritten here. + + We call regex_compile to do the actual compilation, using the global re_syntax_options. */ + + const char * + re_compile_pattern (pattern, length, bufp) + const char *pattern; + size_t length; + struct re_pattern_buffer *bufp; + { + reg_errcode_t ret; + + /* GNU code is written to assume at least RE_NREGS registers will be set + (and at least one extra will be -1). */ + bufp->regs_allocated = REGS_UNALLOCATED; + + /* And GNU code determines whether or not to get register information + by passing null for the REGS argument to re_match, etc., not by + setting no_sub. */ + bufp->no_sub = 0; + + /* Match anchors at newline. */ + bufp->newline_anchor = 1; + + ret = regex_compile (pattern, length, re_syntax_options, bufp); + + return re_error_msg[(int) ret]; + } + + /* Entry points compatible with 4.2 BSD regex library. We don't define + them if this is an Emacs or POSIX compilation. */ + + #if !defined (emacs) && !defined (_POSIX_SOURCE) + + /* BSD has one and only one pattern buffer. 
*/ + static struct re_pattern_buffer re_comp_buf; + + char * + re_comp (s) + const char *s; + { + reg_errcode_t ret; + + if (!s) + { + if (!re_comp_buf.buffer) + return "No previous regular expression"; + return 0; + } + + if (!re_comp_buf.buffer) + { + re_comp_buf.buffer = (unsigned char *) malloc (200); + if (re_comp_buf.buffer == NULL) + return "Memory exhausted"; + re_comp_buf.allocated = 200; + + re_comp_buf.fastmap = (char *) malloc (1 << BYTEWIDTH); + if (re_comp_buf.fastmap == NULL) + return "Memory exhausted"; + } + + /* Since `re_exec' always passes NULL for the `regs' argument, we + don't need to initialize the pattern buffer fields which affect it. */ + + /* Match anchors at newlines. */ + re_comp_buf.newline_anchor = 1; + + ret = regex_compile (s, strlen (s), re_syntax_options, &re_comp_buf); + + /* Yes, we're discarding `const' here. */ + return (char *) re_error_msg[(int) ret]; + } + + + int + re_exec (s) + const char *s; + { + const int len = strlen (s); + return + 0 <= re_search (&re_comp_buf, s, len, 0, len, (struct re_registers *) 0); + } + #endif /* not emacs and not _POSIX_SOURCE */ + + /* POSIX.2 functions. Don't define these for Emacs. */ + + #ifndef emacs + + /* regcomp takes a regular expression as a string and compiles it. + + PREG is a regex_t *. We do not expect any fields to be initialized, + since POSIX says we shouldn't. Thus, we set + + `buffer' to the compiled pattern; + `used' to the length of the compiled pattern; + `syntax' to RE_SYNTAX_POSIX_EXTENDED if the + REG_EXTENDED bit in CFLAGS is set; otherwise, to + RE_SYNTAX_POSIX_BASIC; + `newline_anchor' to REG_NEWLINE being set in CFLAGS; + `fastmap' and `fastmap_accurate' to zero; + `re_nsub' to the number of subexpressions in PATTERN. + + PATTERN is the address of the pattern string. + + CFLAGS is a series of bits which affect compilation. + + If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we + use POSIX basic syntax. + + If REG_NEWLINE is set, then . and [^...] 
don't match newline. + Also, regexec will try a match beginning after every newline. + + If REG_ICASE is set, then we considers upper- and lowercase + versions of letters to be equivalent when matching. + + If REG_NOSUB is set, then when PREG is passed to regexec, that + routine will report only success or failure, and nothing about the + registers. + + It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for + the return codes and their meanings.) */ + + int + regcomp (preg, pattern, cflags) + regex_t *preg; + const char *pattern; + int cflags; + { + reg_errcode_t ret; + reg_syntax_t syntax + = (cflags & REG_EXTENDED) ? + RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC; + + /* regex_compile will allocate the space for the compiled pattern. */ + preg->buffer = 0; + preg->allocated = 0; + preg->used = 0; + + /* Don't bother to use a fastmap when searching. This simplifies the + REG_NEWLINE case: if we used a fastmap, we'd have to put all the + characters after newlines into the fastmap. This way, we just try + every character. */ + preg->fastmap = 0; + + if (cflags & REG_ICASE) + { + unsigned i; + + preg->translate = (char *) malloc (CHAR_SET_SIZE); + if (preg->translate == NULL) + return (int) REG_ESPACE; + + /* Map uppercase characters to corresponding lowercase ones. */ + for (i = 0; i < CHAR_SET_SIZE; i++) + preg->translate[i] = ISUPPER (i) ? tolower (i) : i; + } + else + preg->translate = NULL; + + /* If REG_NEWLINE is set, newlines are treated differently. */ + if (cflags & REG_NEWLINE) + { /* REG_NEWLINE implies neither . nor [^...] match newline. */ + syntax &= ~RE_DOT_NEWLINE; + syntax |= RE_HAT_LISTS_NOT_NEWLINE; + /* It also changes the matching behavior. */ + preg->newline_anchor = 1; + } + else + preg->newline_anchor = 0; + + preg->no_sub = !!(cflags & REG_NOSUB); + + /* POSIX says a null character in the pattern terminates it, so we + can use strlen here in compiling the pattern. 
*/ + ret = regex_compile (pattern, strlen (pattern), syntax, preg); + + /* POSIX doesn't distinguish between an unmatched open-group and an + unmatched close-group: both are REG_EPAREN. */ + if (ret == REG_ERPAREN) ret = REG_EPAREN; + + return (int) ret; + } + + + /* regexec searches for a given pattern, specified by PREG, in the + string STRING. + + If NMATCH is zero or REG_NOSUB was set in the cflags argument to + `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at + least NMATCH elements, and we set them to the offsets of the + corresponding matched substrings. + + EFLAGS specifies `execution flags' which affect matching: if + REG_NOTBOL is set, then ^ does not match at the beginning of the + string; if REG_NOTEOL is set, then $ does not match at the end. + + We return 0 if we find a match and REG_NOMATCH if not. */ + + int + regexec (preg, string, nmatch, pmatch, eflags) + const regex_t *preg; + const char *string; + size_t nmatch; + regmatch_t pmatch[]; + int eflags; + { + int ret; + struct re_registers regs; + regex_t private_preg; + int len = strlen (string); + boolean want_reg_info = !preg->no_sub && nmatch > 0; + + private_preg = *preg; + + private_preg.not_bol = !!(eflags & REG_NOTBOL); + private_preg.not_eol = !!(eflags & REG_NOTEOL); + + /* The user has told us exactly how many registers to return + information about, via `nmatch'. We have to pass that on to the + matching routines. */ + private_preg.regs_allocated = REGS_FIXED; + + if (want_reg_info) + { + regs.num_regs = nmatch; + regs.start = TALLOC (nmatch, regoff_t); + regs.end = TALLOC (nmatch, regoff_t); + if (regs.start == NULL || regs.end == NULL) + return (int) REG_NOMATCH; + } + + /* Perform the searching operation. */ + ret = re_search (&private_preg, string, len, + /* start: */ 0, /* range: */ len, + want_reg_info ? &regs : (struct re_registers *) 0); + + /* Copy the register information to the POSIX structure. 
*/ + if (want_reg_info) + { + if (ret >= 0) + { + unsigned r; + + for (r = 0; r < nmatch; r++) + { + pmatch[r].rm_so = regs.start[r]; + pmatch[r].rm_eo = regs.end[r]; + } + } + + /* If we needed the temporary register info, free the space now. */ + free (regs.start); + free (regs.end); + } + + /* We want zero return to mean success, unlike `re_search'. */ + return ret >= 0 ? (int) REG_NOERROR : (int) REG_NOMATCH; + } + + + /* Returns a message corresponding to an error code, ERRCODE, returned + from either regcomp or regexec. We don't use PREG here. */ + + size_t + regerror (errcode, preg, errbuf, errbuf_size) + int errcode; + const regex_t *preg; + char *errbuf; + size_t errbuf_size; + { + const char *msg; + size_t msg_size; + + if (errcode < 0 + || errcode >= (sizeof (re_error_msg) / sizeof (re_error_msg[0]))) + /* Only error codes returned by the rest of the code should be passed + to this routine. If we are given anything else, or if other regex + code generates an invalid error code, then the program has a bug. + Dump core so we can fix it. */ + abort (); + + msg = re_error_msg[errcode]; + + /* POSIX doesn't require that we do anything in this case, but why + not be nice. */ + if (! msg) + msg = "Success"; + + msg_size = strlen (msg) + 1; /* Includes the null. */ + + if (errbuf_size != 0) + { + if (msg_size > errbuf_size) + { + strncpy (errbuf, msg, errbuf_size - 1); + errbuf[errbuf_size - 1] = 0; + } + else + strcpy (errbuf, msg); + } + + return msg_size; + } + + + /* Free dynamically allocated space used by PREG. 
*/ + + void + regfree (preg) + regex_t *preg; + { + if (preg->buffer != NULL) + free (preg->buffer); + preg->buffer = NULL; + + preg->allocated = 0; + preg->used = 0; + + if (preg->fastmap != NULL) + free (preg->fastmap); + preg->fastmap = NULL; + preg->fastmap_accurate = 0; + + if (preg->translate != NULL) + free (preg->translate); + preg->translate = NULL; + } + + #endif /* not emacs */ + + /* + Local variables: + make-backup-files: t + version-control: t + trim-versions-without-asking: nil + End: + */ diff -crN gawk-2.15.3/regex.h gawk-2.15.4/regex.h *** gawk-2.15.3/regex.h Wed Dec 31 19:00:00 1969 --- gawk-2.15.4/regex.h Fri Nov 26 13:37:12 1993 *************** *** 0 **** --- 1,505 ---- + /* Definitions for data structures and routines for the regular + expression library, version 0.12. + + Copyright (C) 1985, 1989, 1990, 1991, 1992, 1993 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + + #ifndef __REGEXP_LIBRARY_H__ + #define __REGEXP_LIBRARY_H__ + + /* POSIX says that <sys/types.h> must be included (by the caller) before + <regex.h>. */ + + #ifdef VMS + /* VMS doesn't have `size_t' in <sys/types.h>, even though POSIX says it + should be there. */ + #include <stddef.h> + #endif + + + /* The following two types have to be signed and unsigned integer type + wide enough to hold a value of a pointer. 
For most ANSI compilers + ptrdiff_t and size_t should be likely OK. Still size of these two + types is 2 for Microsoft C. Ugh... */ + typedef long s_reg_t; + typedef unsigned long active_reg_t; + + /* The following bits are used to determine the regexp syntax we + recognize. The set/not-set meanings are chosen so that Emacs syntax + remains the value 0. The bits are given in alphabetical order, and + the definitions shifted by one from the previous bit; thus, when we + add or remove a bit, only one other definition need change. */ + typedef unsigned long reg_syntax_t; + + /* If this bit is not set, then \ inside a bracket expression is literal. + If set, then such a \ quotes the following character. */ + #define RE_BACKSLASH_ESCAPE_IN_LISTS (1L) + + /* If this bit is not set, then + and ? are operators, and \+ and \? are + literals. + If set, then \+ and \? are operators and + and ? are literals. */ + #define RE_BK_PLUS_QM (RE_BACKSLASH_ESCAPE_IN_LISTS << 1) + + /* If this bit is set, then character classes are supported. They are: + [:alpha:], [:upper:], [:lower:], [:digit:], [:alnum:], [:xdigit:], + [:space:], [:print:], [:punct:], [:graph:], and [:cntrl:]. + If not set, then character classes are not supported. */ + #define RE_CHAR_CLASSES (RE_BK_PLUS_QM << 1) + + /* If this bit is set, then ^ and $ are always anchors (outside bracket + expressions, of course). + If this bit is not set, then it depends: + ^ is an anchor if it is at the beginning of a regular + expression or after an open-group or an alternation operator; + $ is an anchor if it is at the end of a regular expression, or + before a close-group or an alternation operator. + + This bit could be (re)combined with RE_CONTEXT_INDEP_OPS, because + POSIX draft 11.2 says that * etc. in leading positions is undefined. + We already implemented a previous draft which made those constructs + invalid, though, so we haven't changed the code back. 
*/ + #define RE_CONTEXT_INDEP_ANCHORS (RE_CHAR_CLASSES << 1) + + /* If this bit is set, then special characters are always special + regardless of where they are in the pattern. + If this bit is not set, then special characters are special only in + some contexts; otherwise they are ordinary. Specifically, + * + ? and intervals are only special when not after the beginning, + open-group, or alternation operator. */ + #define RE_CONTEXT_INDEP_OPS (RE_CONTEXT_INDEP_ANCHORS << 1) + + /* If this bit is set, then *, +, ?, and { cannot be first in an re or + immediately after an alternation or begin-group operator. */ + #define RE_CONTEXT_INVALID_OPS (RE_CONTEXT_INDEP_OPS << 1) + + /* If this bit is set, then . matches newline. + If not set, then it doesn't. */ + #define RE_DOT_NEWLINE (RE_CONTEXT_INVALID_OPS << 1) + + /* If this bit is set, then . doesn't match NUL. + If not set, then it does. */ + #define RE_DOT_NOT_NULL (RE_DOT_NEWLINE << 1) + + /* If this bit is set, nonmatching lists [^...] do not match newline. + If not set, they do. */ + #define RE_HAT_LISTS_NOT_NEWLINE (RE_DOT_NOT_NULL << 1) + + /* If this bit is set, either \{...\} or {...} defines an + interval, depending on RE_NO_BK_BRACES. + If not set, \{, \}, {, and } are literals. */ + #define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1) + + /* If this bit is set, +, ? and | aren't recognized as operators. + If not set, they are. */ + #define RE_LIMITED_OPS (RE_INTERVALS << 1) + + /* If this bit is set, newline is an alternation operator. + If not set, newline is literal. */ + #define RE_NEWLINE_ALT (RE_LIMITED_OPS << 1) + + /* If this bit is set, then `{...}' defines an interval, and \{ and \} + are literals. + If not set, then `\{...\}' defines an interval. */ + #define RE_NO_BK_BRACES (RE_NEWLINE_ALT << 1) + + /* If this bit is set, (...) defines a group, and \( and \) are literals. + If not set, \(...\) defines a group, and ( and ) are literals. 
*/ + #define RE_NO_BK_PARENS (RE_NO_BK_BRACES << 1) + + /* If this bit is set, then \<digit> matches <digit>. + If not set, then \<digit> is a back-reference. */ + #define RE_NO_BK_REFS (RE_NO_BK_PARENS << 1) + + /* If this bit is set, then | is an alternation operator, and \| is literal. + If not set, then \| is an alternation operator, and | is literal. */ + #define RE_NO_BK_VBAR (RE_NO_BK_REFS << 1) + + /* If this bit is set, then a starting range point collating higher + than the ending range point, as in [z-a], is invalid. + If not set, then when the starting range point collates higher than the + ending range point, the range is ignored. */ + #define RE_NO_EMPTY_RANGES (RE_NO_BK_VBAR << 1) + + /* If this bit is set, then an unmatched ) is ordinary. + If not set, then an unmatched ) is invalid. */ + #define RE_UNMATCHED_RIGHT_PAREN_ORD (RE_NO_EMPTY_RANGES << 1) + + /* If this bit is set, do not process the GNU regex operators. + If not set, then the GNU regex operators are recognized. */ + #define RE_NO_GNU_OPS (RE_UNMATCHED_RIGHT_PAREN_ORD << 1) + + /* This global variable defines the particular regexp syntax to use (for + some interfaces). When a regexp is compiled, the syntax used is + stored in the pattern buffer, so changing this does not affect + already-compiled regexps. */ + extern reg_syntax_t re_syntax_options; + + /* Define combinations of the above bits for the standard possibilities. + (The [[[ comments delimit what gets put into the Texinfo file, so + don't delete them!) 
*/ + /* [[[begin syntaxes]]] */ + #define RE_SYNTAX_EMACS 0 + + #define RE_SYNTAX_AWK \ + (RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DOT_NOT_NULL \ + | RE_NO_BK_PARENS | RE_NO_BK_REFS \ + | RE_NO_BK_VBAR | RE_NO_EMPTY_RANGES \ + | RE_UNMATCHED_RIGHT_PAREN_ORD | RE_NO_GNU_OPS) + + #define RE_SYNTAX_GNU_AWK \ + (RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS) + + #define RE_SYNTAX_POSIX_AWK \ + (RE_SYNTAX_GNU_AWK | RE_NO_GNU_OPS) + + #define RE_SYNTAX_GREP \ + (RE_BK_PLUS_QM | RE_CHAR_CLASSES \ + | RE_HAT_LISTS_NOT_NEWLINE | RE_INTERVALS \ + | RE_NEWLINE_ALT) + + #define RE_SYNTAX_EGREP \ + (RE_CHAR_CLASSES | RE_CONTEXT_INDEP_ANCHORS \ + | RE_CONTEXT_INDEP_OPS | RE_HAT_LISTS_NOT_NEWLINE \ + | RE_NEWLINE_ALT | RE_NO_BK_PARENS \ + | RE_NO_BK_VBAR) + + #define RE_SYNTAX_POSIX_EGREP \ + (RE_SYNTAX_EGREP | RE_INTERVALS | RE_NO_BK_BRACES) + + /* P1003.2/D11.2, section 4.20.7.1, lines 5078ff. */ + #define RE_SYNTAX_ED RE_SYNTAX_POSIX_BASIC + + #define RE_SYNTAX_SED RE_SYNTAX_POSIX_BASIC + + /* Syntax bits common to both basic and extended POSIX regex syntax. */ + #define _RE_SYNTAX_POSIX_COMMON \ + (RE_CHAR_CLASSES | RE_DOT_NEWLINE | RE_DOT_NOT_NULL \ + | RE_INTERVALS | RE_NO_EMPTY_RANGES) + + #define RE_SYNTAX_POSIX_BASIC \ + (_RE_SYNTAX_POSIX_COMMON | RE_BK_PLUS_QM) + + /* Differs from ..._POSIX_BASIC only in that RE_BK_PLUS_QM becomes + RE_LIMITED_OPS, i.e., \? \+ \| are not recognized. Actually, this + isn't minimal, since other operators, such as \`, aren't disabled. */ + #define RE_SYNTAX_POSIX_MINIMAL_BASIC \ + (_RE_SYNTAX_POSIX_COMMON | RE_LIMITED_OPS) + + #define RE_SYNTAX_POSIX_EXTENDED \ + (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \ + | RE_CONTEXT_INDEP_OPS | RE_NO_BK_BRACES \ + | RE_NO_BK_PARENS | RE_NO_BK_VBAR \ + | RE_UNMATCHED_RIGHT_PAREN_ORD) + + /* Differs from ..._POSIX_EXTENDED in that RE_CONTEXT_INVALID_OPS + replaces RE_CONTEXT_INDEP_OPS and RE_NO_BK_REFS is added. 
*/ + #define RE_SYNTAX_POSIX_MINIMAL_EXTENDED \ + (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \ + | RE_CONTEXT_INVALID_OPS | RE_NO_BK_BRACES \ + | RE_NO_BK_PARENS | RE_NO_BK_REFS \ + | RE_NO_BK_VBAR | RE_UNMATCHED_RIGHT_PAREN_ORD) + /* [[[end syntaxes]]] */ + + /* Maximum number of duplicates an interval can allow. Some systems + (erroneously) define this in other header files, but we want our + value, so remove any previous define. */ + #ifdef RE_DUP_MAX + #undef RE_DUP_MAX + #endif + /* if sizeof(int) == 2, then ((1 << 15) - 1) overflows */ + #define RE_DUP_MAX (0x7fff) + + + /* POSIX `cflags' bits (i.e., information for `regcomp'). */ + + /* If this bit is set, then use extended regular expression syntax. + If not set, then use basic regular expression syntax. */ + #define REG_EXTENDED 1 + + /* If this bit is set, then ignore case when matching. + If not set, then case is significant. */ + #define REG_ICASE (REG_EXTENDED << 1) + + /* If this bit is set, then anchors match at newline characters + in the string, and neither . nor [^...] matches a newline. + If not set, then anchors do not match at newlines. */ + #define REG_NEWLINE (REG_ICASE << 1) + + /* If this bit is set, then report only success or fail in regexec. + If not set, then regexec also reports subexpression offsets when NMATCH is nonzero. */ + #define REG_NOSUB (REG_NEWLINE << 1) + + + /* POSIX `eflags' bits (i.e., information for regexec). */ + + /* If this bit is set, then the beginning-of-line operator doesn't match + the beginning of the string (presumably because it's not the + beginning of a line). + If not set, then the beginning-of-line operator does match the + beginning of the string. */ + #define REG_NOTBOL 1 + + /* Like REG_NOTBOL, except for the end-of-line. */ + #define REG_NOTEOL (1 << 1) + + + /* If any error codes are removed, changed, or added, update the + `re_error_msg' table in regex.c. 
*/ + typedef enum + { + REG_NOERROR = 0, /* Success. */ + REG_NOMATCH, /* Didn't find a match (for regexec). 
*/ + + /* POSIX regcomp return error codes. (In the order listed in the + standard.) */ + REG_BADPAT, /* Invalid pattern. */ + REG_ECOLLATE, /* Not implemented. */ + REG_ECTYPE, /* Invalid character class name. */ + REG_EESCAPE, /* Trailing backslash. */ + REG_ESUBREG, /* Invalid back reference. */ + REG_EBRACK, /* Unmatched left bracket. */ + REG_EPAREN, /* Parenthesis imbalance. */ + REG_EBRACE, /* Unmatched \{. */ + REG_BADBR, /* Invalid contents of \{\}. */ + REG_ERANGE, /* Invalid range end. */ + REG_ESPACE, /* Ran out of memory. */ + REG_BADRPT, /* No preceding re for repetition op. */ + + /* Error codes we've added. */ + REG_EEND, /* Premature end. */ + REG_ESIZE, /* Compiled pattern bigger than 2^16 bytes. */ + REG_ERPAREN /* Unmatched ) or \); not returned from regcomp. */ + } reg_errcode_t; + + /* This data structure represents a compiled pattern. Before calling + the pattern compiler, the fields `buffer', `allocated', `fastmap', + `translate', and `no_sub' can be set. After the pattern has been + compiled, the `re_nsub' field is available. All other fields are + private to the regex routines. */ + + struct re_pattern_buffer + { + /* [[[begin pattern_buffer]]] */ + /* Space that holds the compiled pattern. It is declared as + `unsigned char *' because its elements are + sometimes used as array indexes. */ + unsigned char *buffer; + + /* Number of bytes to which `buffer' points. */ + unsigned long allocated; + + /* Number of bytes actually used in `buffer'. */ + unsigned long used; + + /* Syntax setting with which the pattern was compiled. */ + reg_syntax_t syntax; + + /* Pointer to a fastmap, if any, otherwise zero. re_search uses + the fastmap, if there is one, to skip over impossible + starting points for matches. */ + char *fastmap; + + /* Either a translate table to apply to all characters before + comparing them, or zero for no translation. The translation + is applied to a pattern when it is compiled and to a string + when it is matched. 
*/ + char *translate; + + /* Number of subexpressions found by the compiler. */ + size_t re_nsub; + + /* Zero if this pattern cannot match the empty string, one else. + Well, in truth it's used only in `re_search_2', to see + whether or not we should use the fastmap, so we don't set + this absolutely perfectly; see `re_compile_fastmap' (the + `duplicate' case). */ + unsigned can_be_null : 1; + + /* If REGS_UNALLOCATED, allocate space in the `regs' structure + for `max (RE_NREGS, re_nsub + 1)' groups. + If REGS_REALLOCATE, reallocate space if necessary. + If REGS_FIXED, use what's there. */ + #define REGS_UNALLOCATED 0 + #define REGS_REALLOCATE 1 + #define REGS_FIXED 2 + unsigned regs_allocated : 2; + + /* Set to zero when `regex_compile' compiles a pattern; set to one + by `re_compile_fastmap' if it updates the fastmap. */ + unsigned fastmap_accurate : 1; + + /* If set, `re_match_2' does not return information about + subexpressions. */ + unsigned no_sub : 1; + + /* If set, a beginning-of-line anchor doesn't match at the + beginning of the string. */ + unsigned not_bol : 1; + + /* Similarly for an end-of-line anchor. */ + unsigned not_eol : 1; + + /* If true, an anchor at a newline matches. */ + unsigned newline_anchor : 1; + + /* [[[end pattern_buffer]]] */ + }; + + typedef struct re_pattern_buffer regex_t; + + + /* search.c (search_buffer) in Emacs needs this one opcode value. It is + defined both in `regex.c' and here. */ + #define RE_EXACTN_VALUE 1 + + /* Type for byte offsets within the string. POSIX mandates this. */ + typedef int regoff_t; + + + /* This is the structure we store register match data in. See + regex.texinfo for a full description of what registers match. */ + struct re_registers + { + unsigned num_regs; + regoff_t *start; + regoff_t *end; + }; + + + /* If `regs_allocated' is REGS_UNALLOCATED in the pattern buffer, + `re_match_2' returns information about at least this many registers + the first time a `regs' structure is passed. 
*/ + #ifndef RE_NREGS + #define RE_NREGS 30 + #endif + + + /* POSIX specification for registers. Aside from the different names than + `re_registers', POSIX uses an array of structures, instead of a + structure of arrays. */ + typedef struct + { + regoff_t rm_so; /* Byte offset from string's start to substring's start. */ + regoff_t rm_eo; /* Byte offset from string's start to substring's end. */ + } regmatch_t; + + /* Declarations for routines. */ + + /* To avoid duplicating every routine declaration -- once with a + prototype (if we are ANSI), and once without (if we aren't) -- we + use the following macro to declare argument types. This + unfortunately clutters up the declarations a bit, but I think it's + worth it. */ + + #ifdef __STDC__ + + #define _RE_ARGS(args) args + + #else /* not __STDC__ */ + + #define _RE_ARGS(args) () + + #endif /* not __STDC__ */ + + /* Set the current default syntax to SYNTAX, and return the old syntax. + You can also simply assign to the `re_syntax_options' variable. */ + extern reg_syntax_t re_set_syntax _RE_ARGS ((reg_syntax_t syntax)); + + /* Compile the regular expression PATTERN, with length LENGTH + and syntax given by the global `re_syntax_options', into the buffer + BUFFER. Return NULL if successful, and an error string if not. */ + extern const char *re_compile_pattern + _RE_ARGS ((const char *pattern, size_t length, + struct re_pattern_buffer *buffer)); + + + /* Compile a fastmap for the compiled pattern in BUFFER; used to + accelerate searches. Return 0 if successful and -2 if there was an + internal error. */ + extern int re_compile_fastmap _RE_ARGS ((struct re_pattern_buffer *buffer)); + + + /* Search in the string STRING (with length LENGTH) for the pattern + compiled into BUFFER. Start searching at position START, for RANGE + characters. Return the starting position of the match, -1 for no + match, or -2 for an internal error. Also return register + information in REGS (if REGS is nonzero and BUFFER->no_sub is zero). 
*/ + extern int re_search + _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string, + int length, int start, int range, struct re_registers *regs)); + + + /* Like `re_search', but search in the concatenation of STRING1 and + STRING2. Also, stop searching at index START + STOP. */ + extern int re_search_2 + _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string1, + int length1, const char *string2, int length2, + int start, int range, struct re_registers *regs, int stop)); + + + /* Like `re_search', but return how many characters in STRING the regexp + in BUFFER matched, starting at position START. */ + extern int re_match + _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string, + int length, int start, struct re_registers *regs)); + + + /* Relates to `re_match' as `re_search_2' relates to `re_search'. */ + extern int re_match_2 + _RE_ARGS ((struct re_pattern_buffer *buffer, const char *string1, + int length1, const char *string2, int length2, + int start, struct re_registers *regs, int stop)); + + + /* Set REGS to hold NUM_REGS registers, storing them in STARTS and + ENDS. Subsequent matches using BUFFER and REGS will use this memory + for recording register information. STARTS and ENDS must be + allocated with malloc, and must each be at least `NUM_REGS * sizeof + (regoff_t)' bytes long. + + If NUM_REGS == 0, then subsequent matches should allocate their own + register data. + + Unless this function is called, the first search or match using + PATTERN_BUFFER will allocate its own register data, without + freeing the old data. */ + extern void re_set_registers + _RE_ARGS ((struct re_pattern_buffer *buffer, struct re_registers *regs, + unsigned num_regs, regoff_t *starts, regoff_t *ends)); + + /* 4.2 bsd compatibility. */ + extern char *re_comp _RE_ARGS ((const char *)); + extern int re_exec _RE_ARGS ((const char *)); + + /* POSIX compatibility. 
*/ + extern int regcomp _RE_ARGS ((regex_t *preg, const char *pattern, int cflags)); + extern int regexec + _RE_ARGS ((const regex_t *preg, const char *string, size_t nmatch, + regmatch_t pmatch[], int eflags)); + extern size_t regerror + _RE_ARGS ((int errcode, const regex_t *preg, char *errbuf, + size_t errbuf_size)); + extern void regfree _RE_ARGS ((regex_t *preg)); + + #endif /* not __REGEXP_LIBRARY_H__ */ + + /* + Local variables: + make-backup-files: t + version-control: t + trim-versions-without-asking: nil + End: + */ diff -crN gawk-2.15.3/test/Makefile gawk-2.15.4/test/Makefile *** gawk-2.15.3/test/Makefile Tue Nov 2 23:49:26 1993 --- gawk-2.15.4/test/Makefile Thu Jan 13 21:15:32 1994 *************** *** 28,37 **** messages:: @../gawk -f messages.awk >out2 2>out3 ! cmp out1.good out1 && cmp out2.good out2 && cmp out3.good out3 && rm -f out1 out2 out3 argarray:: ! @TEST=test ../gawk -f argarray.awk >tmp cmp argarray.good tmp && rm -f tmp fstabplus:: --- 28,37 ---- messages:: @../gawk -f messages.awk >out2 2>out3 ! { cmp out1.good out1 && cmp out2.good out2 && cmp out3.good out3 && rm -f out1 out2 out3; } || { test -c /dev/stdout && echo IT IS OK THAT THIS TEST FAILED; } argarray:: ! @TEST=test echo just a test | ../gawk -f argarray.awk argarray.awk - >tmp cmp argarray.good tmp && rm -f tmp fstabplus:: *************** *** 70,76 **** manyfiles:: @mkdir junk ! @../gawk 'BEGIN { for (i = 1; i <= 100; i++) print i, i}' >tmp @../gawk -f manyfiles.awk tmp tmp @echo "This number better be 1 ->" | tr -d '\012' @wc -l junk/* | ../gawk '$$1 != 2' | wc -l --- 70,76 ---- manyfiles:: @mkdir junk ! @../gawk 'BEGIN { for (i = 1; i <= 300; i++) print i, i}' >tmp @../gawk -f manyfiles.awk tmp tmp @echo "This number better be 1 ->" | tr -d '\012' @wc -l junk/* | ../gawk '$$1 != 2' | wc -l *************** *** 138,145 **** cmp argtest.good tmp && rm -f tmp badargtest:: ! @-../gawk -f > tmp 2>&1 cmp badargs.good tmp && rm -f tmp clean: ! 
rm -f tmp --- 138,145 ---- cmp argtest.good tmp && rm -f tmp badargtest:: ! @-../gawk -f 2>&1 | grep -v patchlevel > tmp cmp badargs.good tmp && rm -f tmp clean: ! rm -f tmp core diff -crN gawk-2.15.3/test/argarray.awk gawk-2.15.4/test/argarray.awk *** gawk-2.15.3/test/argarray.awk Tue Nov 2 06:16:07 1993 --- gawk-2.15.4/test/argarray.awk Wed Nov 24 10:00:17 1993 *************** *** 7,11 **** print "\t", ARGV[x] print "Environment variable TEST=" ENVIRON["TEST"] print "and the current input file is called \"" FILENAME "\"" ! print "but this would change if we would have something to process" } --- 7,14 ---- print "\t", ARGV[x] print "Environment variable TEST=" ENVIRON["TEST"] print "and the current input file is called \"" FILENAME "\"" ! } ! ! FNR == 1 { ! print "in main loop, this input file is known as \"" FILENAME "\"" } diff -crN gawk-2.15.3/test/argarray.good gawk-2.15.4/test/argarray.good *** gawk-2.15.3/test/argarray.good Tue Nov 2 06:16:08 1993 --- gawk-2.15.4/test/argarray.good Wed Nov 24 18:12:29 1993 *************** *** 1,6 **** ! here we have 1 argument ! which is gawk ! Environment variable TEST=test and the current input file is called "" ! but this would change if we would have something to process --- 1,9 ---- ! here we have 3 arguments ! which are gawk ! argarray.awk ! - ! Environment variable TEST= and the current input file is called "" ! in main loop, this input file is known as "argarray.awk" ! in main loop, this input file is known as "-" diff -crN gawk-2.15.3/test/badargs.good gawk-2.15.4/test/badargs.good *** gawk-2.15.3/test/badargs.good Sun Nov 7 11:25:11 1993 --- gawk-2.15.4/test/badargs.good Sun Dec 26 14:02:54 1993 *************** *** 1,11 **** gawk: option requires an argument -- f ! Gnu Awk (gawk) 2.15, patchlevel 3 ! Usage: gawk [POSIX or GNU style options] -f progfile [--] file ... gawk [POSIX or GNU style options] [--] 'program' file ... 
POSIX options: GNU long options: -f progfile --file=progfile -F fs --field-separator=fs -v var=val --assign=var=val -W compat --compat -W copyleft --copyleft -W copyright --copyright --- 1,11 ---- gawk: option requires an argument -- f ! Usage: gawk [POSIX or GNU style options] -f progfile [--] file ... gawk [POSIX or GNU style options] [--] 'program' file ... POSIX options: GNU long options: -f progfile --file=progfile -F fs --field-separator=fs -v var=val --assign=var=val + -m[fr]=val -W compat --compat -W copyleft --copyleft -W copyright --copyright diff -crN gawk-2.15.3/version.c gawk-2.15.4/version.c *** gawk-2.15.3/version.c Sun May 2 22:03:30 1993 --- gawk-2.15.4/version.c Wed Dec 29 10:37:50 1993 *************** *** 42,46 **** /* 2.14 Mostly bug fixes. */ /* 2.15 Bug fixes plus intermixing of command-line source and files, ! GNU long options, ARGIND, ERRNO and Plan 9 style /dev/ files. */ --- 42,47 ---- /* 2.14 Mostly bug fixes. */ /* 2.15 Bug fixes plus intermixing of command-line source and files, ! GNU long options, ARGIND, ERRNO and Plan 9 style /dev/ files. ! `delete array'. OS/2 port added. */ diff -crN gawk-2.15.3/vms/gawk.cld gawk-2.15.4/vms/gawk.cld *** gawk-2.15.3/vms/gawk.cld Sat Jun 1 11:33:34 1991 --- gawk-2.15.4/vms/gawk.cld Fri Dec 17 08:15:35 1993 *************** *** 9,15 **** qualifier input, value(required,list,type=$infile), label=progfile qualifier commands, value(required), label=program qualifier field_separator, value(required), label=field_sep ! qualifier reg_expr, value(type=reg_expr_keywords) qualifier variables, value(required,list) qualifier copyright qualifier version --- 9,15 ---- qualifier input, value(required,list,type=$infile), label=progfile qualifier commands, value(required), label=program qualifier field_separator, value(required), label=field_sep ! 
qualifier reg_expr, value(type=reg_expr_keywords) !(OBSOLETE) qualifier variables, value(required,list) qualifier copyright qualifier version diff -crN gawk-2.15.3/vms/vms.h gawk-2.15.4/vms/vms.h *** gawk-2.15.3/vms/vms.h Tue Aug 4 21:25:33 1992 --- gawk-2.15.4/vms/vms.h Tue Jan 4 16:18:24 1994 *************** *** 50,56 **** #define vmswork(sts) ((sts)&1) #define vmsfail(sts) (!vmswork(sts)) #define CondVal(sts) ((sts)&0x0FFFFFF8) /* strip severity & msg inhibit */ ! #define Descrip(strdsc,strbuf) Dsc strdsc = {sizeof strbuf - 1, strbuf} extern int shell$is_shell P((void)); extern u_long LIB$FIND_FILE P((const Dsc *, Dsc *, void *, ...)); --- 50,56 ---- #define vmswork(sts) ((sts)&1) #define vmsfail(sts) (!vmswork(sts)) #define CondVal(sts) ((sts)&0x0FFFFFF8) /* strip severity & msg inhibit */ ! #define Descrip(strdsc,strbuf) Dsc strdsc = {sizeof strbuf - 1, (char *)strbuf} extern int shell$is_shell P((void)); extern u_long LIB$FIND_FILE P((const Dsc *, Dsc *, void *, ...)); diff -crN gawk-2.15.3/vms/vms_args.c gawk-2.15.4/vms/vms_args.c *** gawk-2.15.3/vms/vms_args.c Wed Sep 9 21:09:45 1992 --- gawk-2.15.4/vms/vms_args.c Tue Jan 4 16:18:33 1994 *************** *** 4,10 **** */ /* ! * Copyright (C) 1991 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. --- 4,10 ---- */ /* ! * Copyright (C) 1991-1993 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. *************** *** 97,104 **** void vms_arg_fixup( int *pargc, char ***pargv ) { ! char *f_in, *f_out, *f_err, ! *out_mode, *rms_opt1, *rms_opt2, *rms_opt3, *rms_opt4; char **argv = *pargv; int i, argc = *pargc; int err_to_out_redirect = 0, out_to_err_redirect = 0; --- 97,104 ---- void vms_arg_fixup( int *pargc, char ***pargv ) { ! const char *f_in, *f_out, *f_err, ! 
*out_mode, *rms_rfm, *rms_shr, *rms_mrs; char **argv = *pargv; int i, argc = *pargc; int err_to_out_redirect = 0, out_to_err_redirect = 0; *************** *** 120,128 **** f_in = f_out = f_err = NULL; /* stdio setup (no filenames yet) */ out_mode = "w"; /* default access for stdout */ ! rms_opt1 = rms_opt2 = "ctx=stm"; /* ("context = stream") == no-opt */ ! rms_opt3 = "shr=nil"; /* no sharing (for '>' output file) */ ! rms_opt4 = "mrs=0"; /* maximum record size */ for (i = 1; i < argc; i++) { char *p, *fn; --- 120,128 ---- f_in = f_out = f_err = NULL; /* stdio setup (no filenames yet) */ out_mode = "w"; /* default access for stdout */ ! rms_rfm = "rfm=stmlf"; /* stream_LF format */ ! rms_shr = "shr=nil"; /* no sharing (for '>' output file) */ ! rms_mrs = "mrs=0"; /* no maximum record size */ for (i = 1; i < argc; i++) { char *p, *fn; *************** *** 156,163 **** else if (*p == '&') /* '>&' => stderr */ is_out = 0, p++; else if (*p == '$') /* '>$' => kludge for record format */ ! rms_opt1 = "rfm=var", rms_opt2 = "rat=cr", ! rms_opt3 = "shr=get", rms_opt4 = "mrs=32767", p++; else /* '>' => create */ {} /* use default values initialized prior to loop */ p = skipblanks(p); --- 156,163 ---- else if (*p == '&') /* '>&' => stderr */ is_out = 0, p++; else if (*p == '$') /* '>$' => kludge for record format */ ! rms_rfm = "rfm=var", rms_shr = "shr=get,upi", ! rms_mrs = "mrs=32767", p++; else /* '>' => create */ {} /* use default values initialized prior to loop */ p = skipblanks(p); *************** *** 254,260 **** if (f_in) { /* [re]open file and define logical name */ stdin = freopen(f_in, "r", stdin, "ctx=rec", "shr=get,put,del,upd", ! "mrs=32767", "mbc=24", "mbf=2"); if (stdin != NULL) (void) vms_define("SYS$INPUT", f_in); else --- 254,260 ---- if (f_in) { /* [re]open file and define logical name */ stdin = freopen(f_in, "r", stdin, "ctx=rec", "shr=get,put,del,upd", ! 
"mrs=32767", "mbc=32", "mbf=2"); if (stdin != NULL) (void) vms_define("SYS$INPUT", f_in); else *************** *** 262,269 **** } if (f_out) { stdout = freopen(f_out, out_mode, stdout, ! rms_opt1, rms_opt2, rms_opt3, rms_opt4, ! "mbc=24", "mbf=2"); if (stdout != NULL) (void) vms_define("SYS$OUTPUT", f_out); else --- 262,269 ---- } if (f_out) { stdout = freopen(f_out, out_mode, stdout, ! rms_rfm, rms_shr, rms_mrs, ! "rat=cr", "mbc=32", "mbf=2"); if (stdout != NULL) (void) vms_define("SYS$OUTPUT", f_out); else *************** *** 377,395 **** { Dsc log_dsc; static Descrip(lnmtable,"LNM$PROCESS_TABLE"); ! static long attr = LNM$M_CONFINE; ! static Itm itemlist[] = { {sizeof attr,LNM$_ATTRIBUTES,&attr,0}, ! {0,LNM$_STRING,0,0}, {0,0} }; static unsigned char acmode = PSL$C_USER; /* avoid "define SYS$OUTPUT sys$output:" for redundant ">sys$output:" */ ! if (strncasecmp(log_name, trans_val, strlen(log_name)) == 0) return 0; ! log_dsc.len = strlen(log_dsc.adr = (char *)log_name); ! itemlist[1].buffer = (char *)trans_val; ! itemlist[1].len = strlen(trans_val); ! return SYS$CRELNM((u_long *)0, &lnmtable, &log_dsc, &acmode, itemlist); } /* t_strstr -- strstr() substitute; search 'str' for 'sub' */ --- 377,397 ---- { Dsc log_dsc; static Descrip(lnmtable,"LNM$PROCESS_TABLE"); ! static u_long attr = LNM$M_CONFINE; ! static Itm itemlist[] = { {0,LNM$_STRING,0,0}, {0,0} }; static unsigned char acmode = PSL$C_USER; + unsigned len = strlen(log_name); /* avoid "define SYS$OUTPUT sys$output:" for redundant ">sys$output:" */ ! if (strncasecmp(log_name, trans_val, len) == 0 ! && (trans_val[len] == '\0' || trans_val[len] == ':')) return 0; ! log_dsc.adr = (char *)log_name; ! log_dsc.len = len; ! itemlist[0].buffer = (char *)trans_val; ! itemlist[0].len = strlen(trans_val); ! 
return SYS$CRELNM(&attr, &lnmtable, &log_dsc, &acmode, itemlist); } /* t_strstr -- strstr() substitute; search 'str' for 'sub' */ diff -crN gawk-2.15.3/vms/vms_fwrite.c gawk-2.15.4/vms/vms_fwrite.c *** gawk-2.15.3/vms/vms_fwrite.c Tue Aug 4 21:25:34 1992 --- gawk-2.15.4/vms/vms_fwrite.c Wed Dec 29 10:41:29 1993 *************** *** 3,9 **** */ /* ! * Copyright (C) 1991 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. --- 3,9 ---- */ /* ! * Copyright (C) 1991-1993 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. diff -crN gawk-2.15.3/vms/vms_gawk.c gawk-2.15.4/vms/vms_gawk.c *** gawk-2.15.3/vms/vms_gawk.c Thu Oct 21 22:51:42 1993 --- gawk-2.15.4/vms/vms_gawk.c Tue Jan 4 16:18:25 1994 *************** *** 3,9 **** */ /* ! * Copyright (C) 1991 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. --- 3,9 ---- */ /* ! * Copyright (C) 1991-1993 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. *************** *** 101,106 **** --- 101,107 ---- misc_argp = misc_args; *misc_argp++ = '-'; /* now points at &misc_args[1] */ + #if 0 /* as of 2.12, -a and -e are obsolete */ if (Present("REG_EXPR")) { if (Present("REG_EXPR.AWK")) /* /reg_exp=awk -> -a */ *misc_argp++ = 'a'; *************** *** 108,113 **** --- 109,115 ---- || Present("REG_EXPR.POSIX")) /* /reg_exp=posix -> -e */ *misc_argp++ = 'e'; } + #endif /* 0 */ #if 0 /* gawk 2.11.1 */ if (Present("STRICT")) /* /strict -> -c */ *misc_argp++ = 'c'; *************** *** 179,202 **** static int /* note: doesn't return anything; allows 'return vms_usage()' */ vms_usage( int complaint ) { ! static char ! *usage_txt = "\n\ usage: %s /COMMANDS=\"awk program text\" data_file[,data_file,...] 
\n\ or %s /INPUT=awk_file data_file[,\"Var=value\",data_file,...] \n\ or %s /INPUT=(awk_file1,awk_file2,...) data_file[,...] \n\ ! ", *options_txt = "\n\ options: /FIELD_SEPARATOR=\"FS_value\" \n\ - /VARIABLES=(\"Var1=value1\",\"Var2=value2\",...) \n\ - - /REG_EXPR= AWK or EGREP or POSIX \n\ - /LINT /POSIX /[NO]STRICT /VERSION /COPYRIGHT /USAGE \n\ - /OUTPUT=out_file \n\ ! ", *no_prog = "missing required element: /COMMANDS or /INPUT", ! *no_file = "missing required element: data_file \n\ (use \"SYS$INPUT:\" to read data lines from the terminal)", ! *bad_combo = "invalid combination of qualifiers \n\ (/INPUT=awk_file and /COMMANDS=\"awk program\" are mutually exclusive)", ! *run_used = "\"RUN\" was used; required command components missing"; ! int status, argc; fflush(stdout); switch (complaint) { --- 181,205 ---- static int /* note: doesn't return anything; allows 'return vms_usage()' */ vms_usage( int complaint ) { ! static const char ! *usage_txt = "\n\ usage: %s /COMMANDS=\"awk program text\" data_file[,data_file,...] \n\ or %s /INPUT=awk_file data_file[,\"Var=value\",data_file,...] \n\ or %s /INPUT=(awk_file1,awk_file2,...) data_file[,...] \n\ ! ", ! *options_txt = "\n\ options: /FIELD_SEPARATOR=\"FS_value\" \n\ - /VARIABLES=(\"Var1=value1\",\"Var2=value2\",...) \n\ - /LINT /POSIX /[NO]STRICT /VERSION /COPYRIGHT /USAGE \n\ - /OUTPUT=out_file \n\ ! ", ! *no_prog = "missing required element: /COMMANDS or /INPUT", ! *no_file = "missing required element: data_file \n\ (use \"SYS$INPUT:\" to read data lines from the terminal)", ! *bad_combo = "invalid combination of qualifiers \n\ (/INPUT=awk_file and /COMMANDS=\"awk program\" are mutually exclusive)", ! *run_used = "\"RUN\" was used; required command components missing"; ! 
int status, argc; fflush(stdout); switch (complaint) { diff -crN gawk-2.15.3/vms/vms_misc.c gawk-2.15.4/vms/vms_misc.c *** gawk-2.15.3/vms/vms_misc.c Thu Nov 4 06:25:51 1993 --- gawk-2.15.4/vms/vms_misc.c Wed Dec 29 10:41:57 1993 *************** *** 3,9 **** */ /* ! * Copyright (C) 1991 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. --- 3,9 ---- */ /* ! * Copyright (C) 1991-1993 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. *************** *** 30,35 **** --- 30,36 ---- #ifndef O_RDONLY #include #endif + #include #include #include *************** *** 114,121 **** int vms_open( const char *name, int mode, ... ) { if (mode == (O_WRONLY|O_CREAT|O_TRUNC)) ! return creat(name, 0, "shr=nil", "mbc=24"); else { struct stat stb; const char *mbc, *shr = "shr=get"; --- 115,124 ---- int vms_open( const char *name, int mode, ... ) { + int result; + if (mode == (O_WRONLY|O_CREAT|O_TRUNC)) ! result = creat(name, 0, "shr=nil", "mbc=32"); else { struct stat stb; const char *mbc, *shr = "shr=get"; *************** *** 123,133 **** if (stat((char *)name, &stb) < 0) { /* assume DECnet */ mbc = "mbc=8"; } else { /* ordinary file; allow full sharing iff record format */ ! mbc = "mbc=12"; if (stb.st_fab_rfm < FAB$C_STM) shr = "shr=get,put,upd"; } ! return open(name, mode, 0, shr, mbc, "mbf=2"); } } /* --- 126,144 ---- if (stat((char *)name, &stb) < 0) { /* assume DECnet */ mbc = "mbc=8"; } else { /* ordinary file; allow full sharing iff record format */ ! mbc = "mbc=32"; if (stb.st_fab_rfm < FAB$C_STM) shr = "shr=get,put,upd"; } ! result = open(name, mode, 0, shr, mbc, "mbf=2"); } + + /* This is only approximate; the ACP -> RMS -> VAXCRTL interface + discards too much potentially useful status information... 
*/ + if (result < 0 && errno == EVMSERR + && (vaxc$errno == RMS$_ACC || vaxc$errno == RMS$_CRE)) + errno = EMFILE; /* redirect() should close 1 file & try again */ + + return result; } /* diff -crN gawk-2.15.3/vms/vms_popen.c gawk-2.15.4/vms/vms_popen.c *** gawk-2.15.3/vms/vms_popen.c Tue Aug 4 21:25:35 1992 --- gawk-2.15.4/vms/vms_popen.c Wed Dec 29 10:42:07 1993 *************** *** 3,9 **** */ /* ! * Copyright (C) 1991 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. --- 3,9 ---- */ /* ! * Copyright (C) 1991-1993 the Free Software Foundation, Inc. * * This file is part of GAWK, the GNU implementation of the * AWK Progamming Language. diff -crN gawk-2.15.3/vms/vmsbuild.com gawk-2.15.4/vms/vmsbuild.com *** gawk-2.15.3/vms/vmsbuild.com Thu Nov 4 06:25:52 1993 --- gawk-2.15.4/vms/vmsbuild.com Wed Dec 29 10:42:58 1993 *************** *** 5,11 **** $! gawk 2.15 revised, Oct'93 $! $ REL = "2.15" !release version number ! $ PATCHLVL = "3" $! $! [ remove "/optimize=noinline" for VAX C V2.x or DEC C ] $! [ add "/standard=VAXC" for DEC C and "/g_float" for Alpha ] --- 5,11 ---- $! gawk 2.15 revised, Oct'93 $! $ REL = "2.15" !release version number ! $ PATCHLVL = "4" $! $! [ remove "/optimize=noinline" for VAX C V2.x or DEC C ] $! [ add "/standard=VAXC" for DEC C and "/g_float" for Alpha ] EOF