minix/commands/ash/bltin/regexp.c
2006-05-23 12:59:34 +00:00

301 lines
5 KiB
C
Executable file

/*
* Regular expression matching for expr(1). Bugs: The upper bound of
* a range specified by the \{ feature cannot be zero.
*
* Copyright (C) 1989 by Kenneth Almquist. All rights reserved.
* This file is part of ash, which is distributed under the terms specified
* by the Ash General Public License. See the file named LICENSE.
*/
#include "bltin.h"
#include "myregexp.h"
#include <stdlib.h>
#define RE_END 0 /* end of regular expression */
#define RE_LITERAL 1 /* normal character follows */
#define RE_DOT 2 /* "." */
#define RE_CCL 3 /* "[...]" */
#define RE_NCCL 4 /* "[^...]" */
#define RE_LP 5 /* "\(" */
#define RE_RP 6 /* "\)" */
#define RE_MATCHED 7 /* "\digit" */
#define RE_EOS 8 /* "$" matches end of string */
#define RE_STAR 9 /* "*" */
#define RE_RANGE 10 /* "\{num,num\}" */
char *match_begin[10];
short match_length[10];
short number_parens;
static int match(char *pattern, char *string);
char *
re_compile(pattern)
char *pattern;
{
register char *p;
register char c;
char *comp;
register char *q;
char *begin;
char *endp;
register int len;
int first;
int type;
char *stackp;
char stack[10];
int paren_num;
int i;
p = pattern;
if (*p == '^')
p++;
comp = q = malloc(2 * strlen(p) + 1);
begin = q;
stackp = stack;
paren_num = 0;
for (;;) {
switch (c = *p++) {
case '\0':
*q = '\0';
goto out;
case '.':
*q++ = RE_DOT;
len = 1;
break;
case '[':
begin = q;
*q = RE_CCL;
if (*p == '^') {
*q = RE_NCCL;
p++;
}
q++;
first = 1;
while (*p != ']' || first == 1) {
if (p[1] == '-' && p[2] != ']') {
*q++ = '-';
*q++ = p[0];
*q++ = p[2];
p += 3;
} else if (*p == '-') {
*q++ = '-';
*q++ = '-';
*q++ = '-';
p++;
} else {
*q++ = *p++;
}
first = 0;
}
p++;
*q++ = '\0';
len = q - begin;
break;
case '$':
if (*p != '\0')
goto dft;
*q++ = RE_EOS;
break;
case '*':
if (len == 0)
goto dft;
type = RE_STAR;
range:
i = (type == RE_RANGE)? 3 : 1;
endp = q + i;
begin = q - len;
do {
--q;
*(q + i) = *q;
} while (--len > 0);
q = begin;
*q++ = type;
if (type == RE_RANGE) {
i = 0;
while ((unsigned)(*p - '0') <= 9)
i = 10 * i + (*p++ - '0');
*q++ = i;
if (*p != ',') {
*q++ = i;
} else {
p++;
i = 0;
while ((unsigned)(*p - '0') <= 9)
i = 10 * i + (*p++ - '0');
*q++ = i;
}
if (*p != '\\' || *++p != '}')
error("RE error");
p++;
}
q = endp;
break;
case '\\':
if ((c = *p++) == '(') {
if (++paren_num > 9)
error("RE error");
*q++ = RE_LP;
*q++ = paren_num;
*stackp++ = paren_num;
len = 0;
} else if (c == ')') {
if (stackp == stack)
error("RE error");
*q++ = RE_RP;
*q++ = *--stackp;
len = 0;
} else if (c == '{') {
type = RE_RANGE;
goto range;
} else if ((unsigned)(c - '1') < 9) {
/* should check validity here */
*q++ = RE_MATCHED;
*q++ = c - '0';
len = 2;
} else {
goto dft;
}
break;
default:
dft: *q++ = RE_LITERAL;
*q++ = c;
len = 2;
break;
}
}
out:
if (stackp != stack)
error("RE error");
number_parens = paren_num;
return comp;
}
int
re_match(pattern, string)
char *pattern;
char *string;
{
char **pp;
match_begin[0] = string;
for (pp = &match_begin[1] ; pp <= &match_begin[9] ; pp++)
*pp = 0;
return match(pattern, string);
}
/*
 * Match the compiled code at "pattern" against "string", which must be
 * matched from its first character.  Returns 1 on success, 0 on failure.
 *
 * Atoms are executed one after another until RE_STAR or RE_RANGE is
 * met.  From then on ("counting" mode) the repeated atom is applied as
 * often as possible, up to "high" times.  When it finally fails, the
 * code after the atom is tried recursively at each candidate position,
 * longest repetition first, down to "low" repetitions.
 *
 * Side effects: records group positions in match_begin[] and
 * match_length[], and the overall length in match_length[0] when RE_END
 * is reached.
 */
static int
match(char *pattern, char *string)
{
	char *p = pattern;		/* position in the compiled code */
	char *q = string;		/* position in the subject string */
	int counting = 0;		/* non-zero once a repeat has started */
	int low = 0, high = 0, count = 0;
	char *curpat = NULL;		/* start of the repeated atom */
	char *start_count = NULL;	/* subject position where the repeat began */
	int negate;
	int found;
	char *r;
	int len;
	char c, lo, hi;

	for (;;) {
		if (counting) {
			if (++count > high)
				goto bad;
			p = curpat;	/* run the same atom again */
		}
		switch (*p++) {
		case RE_END:
			match_length[0] = (short)(q - match_begin[0]);
			return 1;
		case RE_LITERAL:
			if (*q++ != *p++)
				goto bad;
			break;
		case RE_DOT:
			if (*q++ == '\0')
				goto bad;
			break;
		case RE_CCL:
			negate = 0;
			goto ccl;
		case RE_NCCL:
			negate = 1;
ccl:
			found = 0;
			c = *q++;
			while (*p) {
				if (*p == '-') {
					/*
					 * Range "- lo hi".  Both bounds are always
					 * consumed; skipping only one of them would
					 * leave p in the middle of the triple.
					 */
					lo = p[1];
					hi = p[2];
					p += 2;
					if (c >= lo && c <= hi)
						found = 1;
				} else {
					if (c == *p)
						found = 1;
				}
				p++;
			}
			p++;		/* step over the list terminator */
			/* The end of the subject never matches, even for [^...]. */
			if (c == '\0' || found == negate)
				goto bad;
			break;
		case RE_LP:
			match_begin[*p++] = q;
			break;
		case RE_RP:
			match_length[*p] = q - match_begin[*p];
			p++;
			break;
		case RE_MATCHED:
			r = match_begin[*p];
			len = match_length[*p++];
			if (r == NULL)
				goto bad;	/* group has not been matched */
			while (--len >= 0) {
				if (*q++ != *r++)
					goto bad;
			}
			break;
		case RE_EOS:
			if (*q != '\0')
				goto bad;
			break;
		case RE_STAR:
			low = 0;
			high = 32767;
			goto range;
		case RE_RANGE:
			/* Counts are stored as raw bytes; read them unsigned. */
			low = (unsigned char)*p++;
			high = (unsigned char)*p++;
			if (high == 0)
				high = 32767;	/* no upper bound given */
range:
			curpat = p;
			start_count = q;
			count = 0;
			counting++;
			break;
		default:
			return 0;	/* unknown opcode: corrupt pattern */
		}
	}
bad:
	if (! counting)
		return 0;
	/*
	 * The repeated atom failed on attempt number "count"; p points just
	 * past the atom.  Each successful repetition consumed "len"
	 * characters: one, or the length of the group for a back-reference.
	 */
	len = 1;
	if (*curpat == RE_MATCHED)
		len = match_length[curpat[1]];
	while (--count >= low) {
		if (match(p, start_count + count * len))
			return 1;
	}
	return 0;
}