minix/commands/sort/sort.c

1192 lines
32 KiB
C
Raw Normal View History

2005-04-21 16:53:53 +02:00
/* sort - sort a file of lines Author: Michiel Huisjes */
/* SYNOPSIS:
* sort [-funbirdcmt'x'] [+beg_pos[opts] [-end_pos]] [-o outfile] [file]..
*
* [opts] can be any of
* -f : Fold upper case to lower.
* -n : Sort to numeric value (optional decimal point) implies -b
* -b : Skip leading blanks
* -i : Ignore chars outside ASCII range (040 - 0176)
* -r : Reverse the sense of comparisons.
* -d : Sort to dictionary order. Only letters, digits, comma's and points
* are compared.
* If any of these flags are used in [opts], then they override all global
* ordering for this field.
*
* I/O control flags are:
* -u : Print uniq lines only once.
* -c : Check if files are sorted in order.
* -m : Merge already sorted files.
* -o outfile : Name of output file. (Can be one of the input files).
* Default is stdout.
* - : Take stdin as input.
*
* Fields:
* -t'x' : Field separating character is 'x'
* +a.b : Start comparing at field 'a' with offset 'b'. A missing 'b' is
* taken to be 0.
* -a.b : Stop comparing at field 'a' with offset 'b'. A missing 'b' is
* taken to be 0.
* A missing -a.b means the rest of the line.
*/
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <signal.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <limits.h>
#define OPEN_FILES (OPEN_MAX-4) /* Nr of open files per process */
#if __minix_vmd
#define MEMORY_SIZE (1024 * 1024)
#else
#define MEMORY_SIZE ((10 * sizeof(int)) * 1024)
#endif
/* Total mem_size */
#define LINE_SIZE (1024 >> 1) /* Max length of a line */
#define IO_SIZE (2 * 1024) /* Size of buffered output */
#define STD_OUT 1 /* Fd of terminal */
/* Return status of functions */
#define OK 0
#define ERROR -1
/* Compare return values */
#define LOWER -1
#define SAME 0
#define HIGHER 1
/* Table definitions. */
#define DICT 0x001 /* Alpha, numeric, letters and . */
#define ASCII 0x002 /* All between ' ' and '~' */
#define BLANK 0x004 /* ' ' and '\t' */
#define DIGIT 0x008 /* 0-9 */
#define UPPER 0x010 /* A-Z */
typedef int BOOL;
#define FALSE 0
#define TRUE 1
typedef struct {
int fd; /* Fd of file */
char *buffer; /* Buffer for reads */
int read_chars; /* Nr of chars actually read in buffer */
int cnt; /* Nr of chars taken out of buffer */
char *line; /* Contains line currently used */
} MERGE;
MERGE merge_f[OPEN_FILES]; /* Merge structs */
int buf_size; /* Size of core available for each struct */
#define FIELDS_LIMIT 10 /* 1 global + 9 user */
#define GLOBAL 0
typedef struct {
int beg_field, beg_pos; /* Begin field + offset */
int end_field, end_pos; /* End field + offset. ERROR == EOLN */
BOOL reverse; /* TRUE if rev. flag set on this field */
BOOL blanks;
BOOL dictionary;
BOOL fold_case;
BOOL ascii;
BOOL numeric;
} FIELD;
/* Field declarations. A total of FILEDS_LIMIT is allowed */
FIELD fields[FIELDS_LIMIT];
int field_cnt; /* Nr of field actually assigned */
/* Various output control flags */
BOOL check = FALSE;
BOOL only_merge = FALSE;
BOOL uniq = FALSE;
char *mem_top; /* Mem_top points to lowest pos of memory. */
char *cur_pos; /* First free position in mem */
char **line_table; /* Pointer to the internal line table */
BOOL in_core = TRUE; /* Set if input cannot all be sorted in core */
/* Place where temp_files should be made */
char temp_files[] = "/tmp/sort.XXXXX.XX";
char *output_file; /* Name of output file */
int out_fd; /* Fd to output file (could be STD_OUT) */
char out_buffer[IO_SIZE]; /* For buffered output */
char **argptr; /* Pointer to argv structure */
int args_offset; /* Nr of args spilled on options */
int args_limit; /* Nr of args given */
char separator; /* Char that separates fields */
int nr_of_files = 0; /* Nr_of_files to be merged */
int disabled; /* Nr of files done */
char USAGE[] = "Usage: sort [-funbirdcmt'x'] [+beg_pos [-end_pos]] [-o outfile] [file] ..";
/* Forward declarations */
int main(int argc, char **argv);
void get_opts(char *ptr, FIELD * field);
void new_field(FIELD * field, int *offset, BOOL beg_fl);
void adjust_options(FIELD * field);
void error(BOOL quit, char *message, char *arg);
void open_outfile(void);
void get_file(int fd, off_t size);
int last_line(void);
void print_table(int fd);
char *file_name(int nr);
void mread(int fd, char *address, int bytes);
void mwrite(int fd, char *address, int bytes);
void sort(void);
void sort_table(int nel);
void incr(int si, int ei);
int cmp_fields(char *el1, char *el2);
void build_field(char *dest, FIELD * field, char *src);
char *skip_fields(char *str, int nf);
int compare(char *el1, char *el2);
int cmp(unsigned char *el1, unsigned char *el2, FIELD * field);
int digits(char *str1, char *str2, BOOL check_sign);
void files_merge(int file_cnt);
void merge(int start_file, int limit_file);
void put_line(char *line);
MERGE * print(MERGE * merg, int file_cnt);
int read_line(MERGE * merg);
MERGE * skip_lines(MERGE * smallest, int file_cnt);
void uniq_lines(MERGE * merg);
void check_file(int fd, char *file);
int length(char *line);
void copy(char *dest, char *src);
char *msbrk(int size);
void mbrk(char *address);
void catch(int dummy);
2005-04-21 16:53:53 +02:00
/* Table of all chars. 0 means no special meaning. */
char table[256] = {
/* '^@' to space */
0, 0, 0, 0, 0, 0, 0, 0,
0, BLANK | DICT, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
/* Space to '0' */
BLANK | DICT | ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII,
ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII,
ASCII, ASCII,
/* '0' until '9' */
DIGIT | DICT | ASCII, DIGIT | DICT | ASCII, DIGIT | DICT | ASCII,
DIGIT | DICT | ASCII, DIGIT | DICT | ASCII, DIGIT | DICT | ASCII,
DIGIT | DICT | ASCII, DIGIT | DICT | ASCII, DIGIT | DICT | ASCII,
DIGIT | DICT | ASCII,
/* ASCII from ':' to '@' */
ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII,
/* Upper case letters 'A' to 'Z' */
UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII,
UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII,
UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII,
UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII,
UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII,
UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII,
UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII,
UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII,
UPPER | DICT | ASCII, UPPER | DICT | ASCII,
/* ASCII from '[' to '`' */
ASCII, ASCII, ASCII, ASCII, ASCII, ASCII,
/* Lower case letters from 'a' to 'z' */
DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII,
DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII,
DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII,
DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII,
DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII,
DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII,
DICT | ASCII, DICT | ASCII,
/* ASCII from '{' to '~' */
ASCII, ASCII, ASCII, ASCII,
/* Stuff from -1 to -177 */
0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0
};
/*
* Get_opts () assigns the options into the field structure as described in ptr.
* This field structure could be the GLOBAL one.
*/
void get_opts(ptr, field)
register char *ptr;
register FIELD *field;
{
switch (*ptr) {
case 'b': /* Skip leading blanks */
field->blanks = TRUE;
break;
case 'd': /* Dictionary order */
field->dictionary = TRUE;
break;
case 'f': /* Fold upper case to lower */
field->fold_case = TRUE;
break;
case 'i': /* Skip chars outside ' ' '~' */
field->ascii = TRUE;
break;
case 'n': /* Sort on numeric */
field->numeric = TRUE;
field->blanks = TRUE;
break;
case 'r': /* Reverse comparisons */
field->reverse = TRUE;
break;
default: /* Illegal options */
error(TRUE, USAGE, NULL);
2005-04-21 16:53:53 +02:00
}
}
/* New_field () assigns a new field as described by the arguments.
* A field description is of the form: +a.b[opts] -c.d, where b and d, as well
* as -c.d and [opts] are optional. Nr before digit is field nr. Nr after digit
* is offset from field.
*/
void new_field(field, offset, beg_fl)
register FIELD *field; /* Field to assign */
int *offset; /* Offset in argv structure */
BOOL beg_fl; /* Assign beg or end of field */
{
register char *ptr;
ptr = argptr[*offset];
*offset += 1; /* Incr offset to next arg */
ptr++;
if (beg_fl)
field->beg_field = atoi(ptr); /* Assign int of first field */
else
field->end_field = atoi(ptr);
while (table[*ptr] & DIGIT) /* Skip all digits */
ptr++;
if (*ptr == '.') { /* Check for offset */
ptr++;
if (beg_fl)
field->beg_pos = atoi(ptr);
else
field->end_pos = atoi(ptr);
while (table[*ptr] & DIGIT) /* Skip digits */
ptr++;
}
if (beg_fl) {
while (*ptr != '\0') /* Check options after field */
get_opts(ptr++, field);
}
if (beg_fl) { /* Check for end pos */
ptr = argptr[*offset];
if (ptr && *ptr == '-' && ((table[*(ptr + 1)] & DIGIT) || *(ptr + 1) == '.')) {
new_field(field, offset, FALSE);
if (field->beg_field > field->end_field)
error(TRUE, "End field is before start field!", NULL);
2005-04-21 16:53:53 +02:00
} else /* No end pos. */
field->end_field = ERROR;
}
}
int main(argc, argv)
int argc;
char *argv[];
{
int arg_count = 1; /* Offset in argv */
struct stat st;
register char *ptr; /* Ptr to *argv in use */
register int fd;
int pid, pow;
argptr = argv;
cur_pos = mem_top = msbrk(MEMORY_SIZE); /* Find lowest mem. location */
while (arg_count < argc && ((ptr = argv[arg_count])[0] == '-' || *ptr == '+')) {
if (*ptr == '-' && *(ptr + 1) == '\0') /* "-" means stdin */
break;
if (*ptr == '+') { /* Assign field. */
if (++field_cnt == FIELDS_LIMIT)
error(TRUE, "Too many fields", NULL);
2005-04-21 16:53:53 +02:00
new_field(&fields[field_cnt], &arg_count, TRUE);
} else { /* Get output options */
while (*++ptr) {
switch (*ptr) {
case 'c': /* Only check file */
check = TRUE;
break;
case 'm': /* Merge (sorted) files */
only_merge = TRUE;
break;
case 'u': /* Only give uniq lines */
uniq = TRUE;
break;
case 'o': /* Name of output file */
output_file = argv[++arg_count];
break;
case 't': /* Field separator */
ptr++;
separator = *ptr;
break;
default: /* Sort options */
get_opts(ptr, &fields[GLOBAL]);
}
}
arg_count++;
}
}
for (fd = 1; fd <= field_cnt; fd++) adjust_options(&fields[fd]);
/* Create name of tem_files 'sort.pid.aa' */
ptr = &temp_files[10];
pid = getpid();
pow = 10000;
while (pow != 0) {
*ptr++ = pid / pow + '0';
pid %= pow;
pow /= 10;
}
signal(SIGINT, catch);
/* Only merge files. Set up */
if (only_merge) {
args_limit = args_offset = arg_count;
while (argv[args_limit] != NULL)
2005-04-21 16:53:53 +02:00
args_limit++; /* Find nr of args */
files_merge(args_limit - arg_count);
exit(0);
}
if (arg_count == argc) { /* No args left. Use stdin */
if (check)
check_file(0, NULL);
2005-04-21 16:53:53 +02:00
else
get_file(0, (off_t) 0);
} else
while (arg_count < argc) { /* Sort or check args */
if (strcmp(argv[arg_count], "-") == 0)
fd = 0;
else if (stat(argv[arg_count], &st) < 0) {
error(FALSE, "Cannot find ", argv[arg_count++]);
continue;
}
/* Open files */
else if ((fd = open(argv[arg_count], O_RDONLY)) < 0) {
error(FALSE, "Cannot open ", argv[arg_count++]);
continue;
}
if (check)
check_file(fd, argv[arg_count]);
else /* Get_file reads whole file */
get_file(fd, st.st_size);
arg_count++;
}
if (check) exit(0);
sort(); /* Sort whatever is left */
if (nr_of_files == 1) /* Only one file sorted -> don't merge */
exit(0);
files_merge(nr_of_files);
return(0);
}
/* Adjust_options() assigns all global variables set also in the fields
* assigned.
*/
void adjust_options(field)
register FIELD *field;
{
register FIELD *gfield = &fields[GLOBAL];
if (gfield->reverse) field->reverse = TRUE;
if (gfield->blanks) field->blanks = TRUE;
if (gfield->dictionary) field->dictionary = TRUE;
if (gfield->fold_case) field->fold_case = TRUE;
if (gfield->ascii) field->ascii = TRUE;
if (gfield->numeric) field->numeric = TRUE;
}
/* Error () prints the error message on stderr and exits if quit == TRUE. */
void error(quit, message, arg)
register BOOL quit;
register char *message, *arg;
{
write(2, message, strlen(message));
if (arg != NULL) write(2, arg, strlen(arg));
2005-04-21 16:53:53 +02:00
perror(" ");
if (quit) exit(1);
}
/* Open_outfile () assigns to out_fd the fd where the output must go when all
* the sorting is done.
*/
void open_outfile()
{
if (output_file == NULL)
2005-04-21 16:53:53 +02:00
out_fd = STD_OUT;
else if ((out_fd = creat(output_file, 0644)) < 0)
error(TRUE, "Cannot creat ", output_file);
}
/* Get_file reads the whole file of filedescriptor fd. If the file is too big
* to keep in core, a partial sort is done, and the output is stashed somewhere.
*/
void get_file(fd, size)
int fd; /* Fd of file to read */
register off_t size; /* Size of file */
{
register int i;
int rest; /* Rest in memory */
char save_ch; /* Used in stdin readings */
rest = MEMORY_SIZE - (cur_pos - mem_top);
if (fd == 0) { /* We're reding stdin */
while ((i = read(0, cur_pos, rest)) > 0) {
if ((cur_pos - mem_top) + i == MEMORY_SIZE) {
in_core = FALSE;
i = last_line(); /* End of last line */
save_ch = mem_top[i];
mem_top[i] = '\0';
sort(); /* Sort core */
mem_top[i] = save_ch; /* Restore erased char */
/* Restore last (half read) line */
for (rest = 0; i + rest != MEMORY_SIZE; rest++)
mem_top[rest] = mem_top[i + rest];
/* Assign current pos. in memory */
cur_pos = &mem_top[rest];
} else { /* Fits, just assign position in mem. */
cur_pos = cur_pos + i;
*cur_pos = '\0';
}
/* Calculate rest of mem */
rest = MEMORY_SIZE - (cur_pos - mem_top);
}
}
/* Reading file. Check size */
else if (size > rest) { /* Won't fit */
mread(fd, cur_pos, rest);
in_core = FALSE;
i = last_line(); /* Get pos. of last line */
mem_top[i] = '\0'; /* Truncate */
(void) lseek(fd, (off_t) (i - MEMORY_SIZE), SEEK_CUR); /* Do this next time */
size = size - rest - i + MEMORY_SIZE; /* Calculate rest */
cur_pos = mem_top; /* Reset mem */
sort(); /* Sort core */
get_file(fd, size); /* Get rest of file */
} else { /* Fits. Just read in */
rest = size;
mread(fd, cur_pos, rest);
cur_pos = cur_pos + rest; /* Reassign cur_pos */
*cur_pos = '\0';
(void) close(fd); /* File completed */
}
}
/* Last_line () find the last line in core and retuns the offset from the top
* of the memory.
*/
int last_line()
{
register int i;
for (i = MEMORY_SIZE - 2; i > 0; i--)
if (mem_top[i] == '\n') break;
return i + 1;
}
/* Print_table prints the line table in the given file_descriptor. If the fd
* equals ERROR, it opens a temp_file itself.
*/
void print_table(fd)
int fd;
{
register char **line_ptr; /* Ptr in line_table */
register char *ptr; /* Ptr to line */
int index = 0; /* Index in output buffer */
if (fd == ERROR) {
if ((fd = creat(file_name(nr_of_files), 0644)) < 0)
error(TRUE, "Cannot creat ", file_name(nr_of_files));
}
for (line_ptr = line_table; *line_ptr != NULL; line_ptr++) {
2005-04-21 16:53:53 +02:00
ptr = *line_ptr;
/* Skip all same lines if uniq is set */
if (uniq && *(line_ptr + 1) != NULL) {
2005-04-21 16:53:53 +02:00
if (compare(ptr, *(line_ptr + 1)) == SAME) continue;
}
do { /* Print line in a buffered way */
out_buffer[index++] = *ptr;
if (index == IO_SIZE) {
mwrite(fd, out_buffer, IO_SIZE);
index = 0;
}
} while (*ptr++ != '\n');
}
mwrite(fd, out_buffer, index);/* Flush buffer */
(void) close(fd); /* Close file */
nr_of_files++; /* Increment nr_of_files to merge */
}
/* File_name () returns the nr argument from the argument list, or a uniq
* filename if the nr is too high, or the arguments were not merge files.
*/
char *file_name(nr)
register int nr;
{
if (only_merge) {
if (args_offset + nr < args_limit) return argptr[args_offset + nr];
}
temp_files[16] = nr / 26 + 'a';
temp_files[17] = nr % 26 + 'a';
return temp_files;
}
/* Mread () performs a normal read (), but checks the return value. */
void mread(fd, address, bytes)
int fd;
char *address;
register int bytes;
{
if (read(fd, address, bytes) < 0 && bytes != 0)
error(TRUE, "Read error", NULL);
2005-04-21 16:53:53 +02:00
}
/* Mwrite () performs a normal write (), but checks the return value. */
void mwrite(fd, address, bytes)
int fd;
char *address;
register int bytes;
{
if (write(fd, address, bytes) != bytes && bytes != 0)
error(TRUE, "Write error", NULL);
2005-04-21 16:53:53 +02:00
}
/* Sort () sorts the input in memory starting at mem_top. */
void sort()
{
register char *ptr = mem_top;
register int count = 0;
/* Count number of lines in memory */
while (*ptr) {
if (*ptr++ == '\n') count++;
}
/* Set up the line table */
line_table = (char **) msbrk(count * sizeof(char *) + sizeof(char *));
count = 1;
ptr = line_table[0] = mem_top;
while (*ptr) {
if (*ptr++ == '\n') line_table[count++] = ptr;
}
line_table[count - 1] = NULL;
2005-04-21 16:53:53 +02:00
/* Sort the line table */
sort_table(count - 1);
/* Stash output somewhere */
if (in_core) {
open_outfile();
print_table(out_fd);
} else
print_table(ERROR);
/* Free line table */
mbrk((char *) line_table);
}
/* Sort_table () sorts the line table consisting of nel elements. */
void sort_table(nel)
register int nel;
{
char *tmp;
register int i;
/* Make heap */
for (i = (nel >> 1); i >= 1; i--) incr(i, nel);
/* Sort from heap */
for (i = nel; i > 1; i--) {
tmp = line_table[0];
line_table[0] = line_table[i - 1];
line_table[i - 1] = tmp;
incr(1, i - 1);
}
}
/* Incr () increments the heap. */
void incr(si, ei)
register int si, ei;
{
char *tmp;
while (si <= (ei >> 1)) {
si <<= 1;
if (si + 1 <= ei && compare(line_table[si - 1], line_table[si]) <= 0)
si++;
if (compare(line_table[(si >> 1) - 1], line_table[si - 1]) >= 0)
return;
tmp = line_table[(si >> 1) - 1];
line_table[(si >> 1) - 1] = line_table[si - 1];
line_table[si - 1] = tmp;
}
}
/* Cmp_fields builds new lines out of the lines pointed to by el1 and el2 and
* puts it into the line1 and line2 arrays. It then calls the cmp () routine
* with the field describing the arguments.
*/
int cmp_fields(el1, el2)
register char *el1, *el2;
{
int i, ret;
char line1[LINE_SIZE], line2[LINE_SIZE];
for (i = 0; i < field_cnt; i++) { /* Setup line parts */
build_field(line1, &fields[i + 1], el1);
build_field(line2, &fields[i + 1], el2);
if ((ret = cmp((unsigned char *) line1, (unsigned char *) line2,
&fields[i + 1])) != SAME)
break; /* If equal, try next field */
}
/* Check for reverse flag */
if (i != field_cnt && fields[i + 1].reverse) return -ret;
/* Else return the last return value of cmp () */
return ret;
}
/* Build_field builds a new line from the src as described by the field.
* The result is put in dest.
*/
void build_field(dest, field, src)
char *dest; /* Holds result */
register FIELD *field; /* Field description */
register char *src; /* Source line */
{
char *begin = src; /* Remember start location */
char *last; /* Pointer to end location */
int i;
/* Skip begin fields */
src = skip_fields(src, field->beg_field);
/* Skip begin positions */
for (i = 0; i < field->beg_pos && *src != '\n'; i++) src++;
/* Copy whatever is left */
copy(dest, src);
/* If end field is assigned truncate (perhaps) the part copied */
if (field->end_field != ERROR) { /* Find last field */
last = skip_fields(begin, field->end_field);
/* Skip positions as given by end fields description */
for (i = 0; i < field->end_pos && *last != '\n'; i++) last++;
dest[last - src] = '\n';/* Truncate line */
}
}
/* Skip_fields () skips nf fields of the line pointed to by str. */
char *skip_fields(str, nf)
register char *str;
int nf;
{
while (nf-- > 0) {
if (separator == '\0') {/* Means ' ' or '\t' */
while (*str != ' ' && *str != '\t' && *str != '\n') str++;
while (table[*str] & BLANK) str++;
} else {
while (*str != separator && *str != '\n') str++;
if (*str == separator) str++;
}
}
return str; /* Return pointer to indicated field */
}
/* Compare is called by all sorting routines. It checks if fields assignments
* has been made. if so, it calls cmp_fields (). If not, it calls cmp () and
* reversed the return value if the (global) reverse flag is set.
*/
int compare(el1, el2)
register char *el1, *el2;
{
int ret;
if (field_cnt > GLOBAL) return cmp_fields(el1, el2);
ret = cmp((unsigned char *) el1, (unsigned char *) el2, &fields[GLOBAL]);
return(fields[GLOBAL].reverse) ? -ret : ret;
}
/* Cmp () is the actual compare routine. It compares according to the
* description given in the field pointer.
*/
int cmp(el1, el2, field)
register unsigned char *el1, *el2;
FIELD *field;
{
int c1, c2;
if (field->blanks) { /* Skip leading blanks */
while (table[*el1] & BLANK) el1++;
while (table[*el2] & BLANK) el2++;
}
if (field->numeric) /* Compare numeric */
return digits((char *) el1, (char *) el2, TRUE);
for (;;) {
while (*el1 == *el2) {
if (*el1++ == '\n') /* EOLN on both strings */
return SAME;
el2++;
}
if (*el1 == '\n') /* EOLN on string one */
return LOWER;
if (*el2 == '\n') return HIGHER;
if (field->ascii) { /* Skip chars outside 040 - 0177 */
if ((table[*el1] & ASCII) == 0) {
do {
el1++;
} while ((table[*el1] & ASCII) == 0);
continue;
}
if ((table[*el2] & ASCII) == 0) {
do {
el2++;
} while ((table[*el2] & ASCII) == 0);
continue;
}
}
if (field->dictionary) {/* Skip non-dict chars */
if ((table[*el1] & DICT) == 0) {
do {
el1++;
} while ((table[*el1] & DICT) == 0);
continue;
}
if ((table[*el2] & DICT) == 0) {
do {
el2++;
} while ((table[*el2] & DICT) == 0);
continue;
}
}
if (field->fold_case) { /* Fold upper case to lower */
if (table[c1 = *el1++] & UPPER) c1 += 'a' - 'A';
if (table[c2 = *el2++] & UPPER) c2 += 'a' - 'A';
if (c1 == c2) continue;
return c1 - c2;
}
return *el1 - *el2;
}
/* NOTREACHED */
}
/*
* Digits compares () the two strings that point to a number of digits followed
* by an optional decimal point.
*/
int digits(str1, str2, check_sign)
register char *str1, *str2;
BOOL check_sign; /* True if sign must be checked */
{
BOOL negative = FALSE; /* True if negative numbers */
int diff, pow, ret;
/* Check for optional minus or plus sign */
if (check_sign) {
if (*str1 == '-') {
negative = TRUE;
str1++;
} else if (*str1 == '+')
str1++;
if (*str2 == '-') {
if (negative == FALSE) return HIGHER;
str2++;
} else if (negative)
return LOWER;
else if (*str2 == '+')
str2++;
}
/* Keep incrementing as long as digits are available and equal */
while ((table[*str1] & DIGIT) && table[*str2] & DIGIT) {
if (*str1 != *str2) break;
str1++;
str2++;
}
/* First check for the decimal point. */
if (*str1 == '.' || *str2 == '.') {
if (*str1 == '.') {
if (*str2 == '.') /* Both. Check decimal part */
ret = digits(str1 + 1, str2 + 1, FALSE);
else
ret = (table[*str2] & DIGIT) ? LOWER : HIGHER;
} else
ret = (table[*str1] & DIGIT) ? HIGHER : LOWER;
}
/* Now either two digits differ, or unknown char is seen (e.g. end of string) */
else if ((table[*str1] & DIGIT) && (table[*str2] & DIGIT)) {
diff = *str1 - *str2; /* Basic difference */
pow = 0; /* Check power of numbers */
while (table[*str1++] & DIGIT) pow++;
while (table[*str2++] & DIGIT) pow--;
ret = (pow == 0) ? diff : pow;
}
/* Unknown char. Check on which string it occurred */
else {
if ((table[*str1] & DIGIT) == 0)
ret = (table[*str2] & DIGIT) ? LOWER : SAME;
else
ret = HIGHER;
}
/* Reverse sense of comparisons if negative is true. (-1000 < -1) */
return(negative) ? -ret : ret;
}
/* Files_merge () merges all files as indicated by nr_of_files. Merging goes
* in numbers of files that can be opened at the same time. (OPEN_FILES)
*/
void files_merge(file_cnt)
register int file_cnt; /* Nr_of_files to merge */
{
register int i;
int limit;
for (i = 0; i < file_cnt; i += OPEN_FILES) {
/* Merge last files and store in output file */
if ((limit = i + OPEN_FILES) >= file_cnt) {
open_outfile();
limit = file_cnt;
} else { /* Merge OPEN_FILES files and store in temp
* file */
temp_files[16] = file_cnt / 26 + 'a';
temp_files[17] = file_cnt % 26 + 'a';
if ((out_fd = creat(temp_files, 0644)) < 0)
error(TRUE, "Cannot creat ", temp_files);
file_cnt++;
}
merge(i, limit);
}
/* Cleanup mess */
i = (only_merge) ? args_limit - args_offset : 0;
while (i < file_cnt) (void) unlink(file_name(i++));
}
/* Merge () merges the files between start_file and limit_file. */
void merge(start_file, limit_file)
int start_file, limit_file;
{
register MERGE *smallest; /* Keeps track of smallest line */
register int i;
int file_cnt = limit_file - start_file; /* Nr of files to merge */
/* Calculate size in core available for file_cnt merge structs */
buf_size = MEMORY_SIZE / file_cnt - LINE_SIZE;
mbrk(mem_top); /* First reset mem to lowest loc. */
disabled = 0; /* All files not done yet */
/* Set up merge structures. */
for (i = start_file; i < limit_file; i++) {
smallest = &merge_f[i - start_file];
if (!strcmp(file_name(i), "-")) /* File is stdin */
smallest->fd = 0;
else if ((smallest->fd = open(file_name(i), O_RDONLY)) < 0) {
smallest->fd = ERROR;
error(FALSE, "Cannot open ", file_name(i));
disabled++; /* Done this file */
continue;
}
smallest->buffer = msbrk(buf_size);
smallest->line = msbrk(LINE_SIZE);
smallest->cnt = smallest->read_chars = 0;
(void) read_line(smallest); /* Read first line */
}
if (disabled == file_cnt) { /* Couldn't open files */
(void) close(out_fd);
return;
}
/* Find a merg struct to assign smallest. */
for (i = 0; i < file_cnt; i++) {
if (merge_f[i].fd != ERROR) {
smallest = &merge_f[i];
break;
}
}
/* Loop until all files minus one are done */
while (disabled < file_cnt - 1) {
if (uniq) /* Skip all same lines */
smallest = skip_lines(smallest, file_cnt);
else { /* Find smallest line */
for (i = 0; i < file_cnt; i++) {
if (merge_f[i].fd == ERROR)
continue; /* We've had this one */
if (compare(merge_f[i].line, smallest->line) < 0)
smallest = &merge_f[i];
}
} /* Print line and read next */
smallest = print(smallest, file_cnt);
}
if (only_merge && uniq)
uniq_lines(smallest); /* Print only uniq lines */
else /* Print rest of file */
while (print(smallest, file_cnt) != NULL);
2005-04-21 16:53:53 +02:00
put_line(NULL); /* Flush output buffer */
2005-04-21 16:53:53 +02:00
}
/* Put_line () prints the line into the out_fd filedescriptor. If line equals
* NULL, the out_fd is flushed and closed.
2005-04-21 16:53:53 +02:00
*/
void put_line(line)
register char *line;
{
static int index = 0; /* Index in out_buffer */
if (line == NULL) { /* Flush and close */
2005-04-21 16:53:53 +02:00
mwrite(out_fd, out_buffer, index);
index = 0;
(void) close(out_fd);
return;
}
do { /* Fill out_buffer with line */
out_buffer[index++] = *line;
if (index == IO_SIZE) {
mwrite(out_fd, out_buffer, IO_SIZE);
index = 0;
}
} while (*line++ != '\n');
}
/*
* Print () prints the line of the merg structure and tries to read another one.
* If this fails, it returns the next merg structure which file_descriptor is
* still open. If none could be found, a NIL structure is returned.
*/
MERGE *print(merg, file_cnt)
register MERGE *merg;
int file_cnt; /* Nr of files that are being merged */
{
register int i;
put_line(merg->line); /* Print the line */
if (read_line(merg) == ERROR) { /* Read next line */
for (i = 0; i < file_cnt; i++) {
if (merge_f[i].fd != ERROR) {
merg = &merge_f[i];
break;
}
}
if (i == file_cnt) /* No more files left */
return NULL;
2005-04-21 16:53:53 +02:00
}
return merg;
}
/* Read_line () reads a line from the fd from the merg struct. If the read
* failed, disabled is incremented and the file is closed. Readings are
* done in buf_size bytes.
* Lines longer than LINE_SIZE are silently truncated.
*/
int read_line(merg)
register MERGE *merg;
{
register char *ptr = merg->line - 1; /* Ptr buf that will hold line */
do {
ptr++;
if (merg->cnt == merg->read_chars) { /* Read new buffer */
if ((merg->read_chars =
read(merg->fd, merg->buffer, buf_size)) <= 0) {
(void) close(merg->fd); /* OOPS */
merg->fd = ERROR;
disabled++;
return ERROR;
}
merg->cnt = 0;
}
*ptr = merg->buffer[merg->cnt++]; /* Assign next char of line */
if (ptr - merg->line == LINE_SIZE - 1)
*ptr = '\n'; /* Truncate very long lines */
} while (*ptr != '\n' && *ptr != '\0');
if (*ptr == '\0') /* Add '\n' to last line */
*ptr = '\n';
*++ptr = '\0'; /* Add '\0' */
return OK;
}
/* Skip_lines () skips all same lines in all the files currently being merged.
* It returns a pointer to the merge struct containing the smallest line.
*/
MERGE *skip_lines(smallest, file_cnt)
register MERGE *smallest;
int file_cnt;
{
register int i;
int ret;
if (disabled == file_cnt - 1) /* We've had all */
return smallest;
for (i = 0; i < file_cnt; i++) {
if (merge_f[i].fd == ERROR || smallest == &merge_f[i])
continue; /* Don't check same file */
while ((ret = compare(merge_f[i].line, smallest->line)) == 0) {
if (read_line(&merge_f[i]) == ERROR) break; /* EOF */
}
if (ret < 0) /* Line wasn't smallest. Try again */
return skip_lines(&merge_f[i], file_cnt);
}
return smallest;
}
/* Uniq_lines () prints only the uniq lines out of the fd of the merg struct. */
void uniq_lines(merg)
register MERGE *merg;
{
char lastline[LINE_SIZE]; /* Buffer to hold last line */
for (;;) {
put_line(merg->line); /* Print this line */
copy(lastline, merg->line); /* and save it */
if (read_line(merg) == ERROR) /* Read the next */
return;
/* Keep reading until lines duffer */
while (compare(lastline, merg->line) == SAME)
if (read_line(merg) == ERROR) return;
}
/* NOTREACHED */
}
/*
* Check_file () checks if a file is sorted in order according to the arguments
* given in main ().
*/
void check_file(fd, file)
int fd;
char *file;
{
register MERGE *merg; /* 1 file only */
char lastline[LINE_SIZE]; /* Save last line */
register int ret; /* ret status of compare */
if (fd == 0) file = "stdin";
merg = (MERGE *) mem_top; /* Assign MERGE structure */
merg->buffer = mem_top + sizeof(MERGE);
merg->line = msbrk(LINE_SIZE);
merg->cnt = merg->read_chars = 0;
merg->fd = fd;
buf_size = MEMORY_SIZE - sizeof(MERGE);
if (read_line(merg) == ERROR) /* Read first line */
return;
copy(lastline, merg->line); /* and save it */
for (;;) {
if (read_line(merg) == ERROR) /* EOF reached */
break;
if ((ret = compare(lastline, merg->line)) > 0) {
error(FALSE, "Disorder in file ", file);
write(2, merg->line, length(merg->line));
break;
} else if (ret < 0) /* Copy if lines not equal */
copy(lastline, merg->line);
else if (uniq) {
error(FALSE, "Non uniq line in file ", file);
write(2, merg->line, length(merg->line));
break;
}
}
mbrk(mem_top); /* Reset mem */
}
/* Length () returns the length of the argument line including the linefeed. */
int length(line)
register char *line;
{
register int i = 1; /* Add linefeed */
while (*line++ != '\n') i++;
return i;
}
/* Copy () copies the src line into the dest line including linefeed. */
void copy(dest, src)
register char *dest, *src;
{
while ((*dest++ = *src++) != '\n');
}
/* Msbrk() does a sbrk() and checks the return value. */
char *msbrk(size)
register int size;
{
register char *address;
if ((address = sbrk(size)) == (char *) -1)
error(TRUE, "Not enough memory. Use chmem to allocate more", NULL);
2005-04-21 16:53:53 +02:00
return address;
}
/* Mbrk() does a brk() and checks the return value. */
void mbrk(address)
char *address;
{
if (brk(address) == -1) error(TRUE, "Cannot reset memory", NULL);
2005-04-21 16:53:53 +02:00
}
void catch(dummy)
int dummy; /* to satisfy the prototype */
{
register int i;
signal(SIGINT, SIG_IGN);
only_merge = FALSE;
for (i = 0; i < 26; i++) (void) unlink(file_name(i));
exit(2);
}