From: Martin Mares Date: Mon, 23 Jul 2012 22:28:06 +0000 (+0200) Subject: Added two-pass code and --table format X-Git-Tag: v1.0~40 X-Git-Url: http://mj.ucw.cz/gitweb/?a=commitdiff_plain;h=4df21758270aa34348eca27f096992d2a1511f48;p=xsv.git Added two-pass code and --table format --- diff --git a/Makefile b/Makefile index bc54c01..f5fcd54 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ PCRE_CFLAGS:=$(shell pcre-config --cflags) PCRE_LIBS:=$(shell pcre-config --libs) -CFLAGS=-O2 -Wall -W -Wno-parentheses -Wstrict-prototypes -Wmissing-prototypes -Wundef -Wredundant-decls -std=gnu99 $(PCRE_CFLAGS) +CFLAGS=-O2 -Wall -W -Wno-parentheses -Wstrict-prototypes -Wmissing-prototypes -Wundef -Wredundant-decls -std=gnu99 $(PCRE_CFLAGS) -g LDLIBS=$(PCRE_LIBS) all: xsv diff --git a/xsv.c b/xsv.c index 6263da0..5bf4f01 100644 --- a/xsv.c +++ b/xsv.c @@ -24,6 +24,13 @@ static void *xmalloc(size_t bytes) return p; } +static void *xmalloc_zero(size_t bytes) +{ + void *p = xmalloc(bytes); + memset(p, 0, bytes); + return p; +} + static void *xrealloc(void *old, size_t bytes) { void *p = realloc(old, bytes); @@ -51,6 +58,8 @@ static void *xrealloc(void *old, size_t bytes) static inline type *name##_nth(name##_t *b, int n) { return &b->start[n]; } \ // end +DECLARE_BUF(intarray, int); + /*** Formats and their parameters ***/ enum format_id { @@ -59,6 +68,8 @@ enum format_id { FORM_CSV, FORM_WS, FORM_REGEX, + FORM_TMP, + FORM_TABLE, }; struct format { @@ -68,16 +79,28 @@ struct format { int quiet; int (*read_line)(void); void (*write_line)(void); + int needs_two_passes; + // CSV backend: int always_quote; + // WS backend: int strict_ws; + // regex backend: pcre *pcre; pcre_extra *pcre_extra; + + // Temporary file backend: + FILE *tmp_file; + intarray_t column_widths; + + // Table backend: + int table_sep; }; static struct format *in_format, *out_format; +static int want_trim; struct field { int start_pos; @@ -131,6 +154,8 @@ static int next_line(void) } } +/*** CSV/TSV back-end */ + static int csv_read(void) { int quoted = 0; @@ -210,6 +235,8 @@ static void csv_write(void) putchar('\n'); } +/*** White-space back-end ***/ + static int ws_read(void) { if (!next_line()) @@ -245,6 +272,8 @@ static int ws_read(void) return 1; } +/*** Regex back-end ***/ + static const char *regex_set(struct format *f, char *rx) { const char *err; @@ -288,6 +317,93 @@ static int regex_read(void) } } +/*** Table back-end ***/ + +static void table_write(void) +{ + for (int i = 0; i < fields_count(&in_fields); i++) { + if (i) + printf("%*s", out_format->table_sep, ""); + struct field *f = fields_nth(&in_fields, i); + int w = *intarray_nth(&in_format->column_widths, i); + if (f->len > w) { + warn(out_format, "Internal error: Wrongly calculated column width (%d > %d)", f->len, w); + w = f->len; + } + int j = 0; + unsigned char *p = line_nth(&in_line, f->start_pos); + while (j < f->len) { + putchar(*p++); + j++; + } + while (j < w) { + putchar(' '); + j++; + } + } + putchar('\n'); +} + +/*** Temporary file back-end ***/ + +static int tmp_read(void) +{ + FILE *tf = in_format->tmp_file; + + for (;;) { + int c = fgetc(tf); + if (c < 0) + return 0; + if (c == 0xff) + return 1; + if (c == 0xfe) { + c = fgetc(tf); + c = (c << 8) | fgetc(tf); + c = (c << 8) | fgetc(tf); + c = (c << 8) | fgetc(tf); + } + new_field(line_count(&in_line)); + in_field->len = c; + while (c--) { + int x = fgetc(tf); + if (x < 0) { + warn(in_format, "Truncated temporary file"); + return 0; + } + *line_push(&in_line) = x; + } + } +} + +static void tmp_write(void) +{ + FILE *tf = out_format->tmp_file; + + for (int i = 0; i < fields_count(&in_fields); i++) { + struct field *f = fields_nth(&in_fields, i); + if (f->len < 0xfe) + fputc(f->len, tf); + else { + fputc(0xfe, tf); + fputc((f->len >> 24) & 0xff, tf); + fputc((f->len >> 16) & 0xff, tf); + fputc((f->len >> 8) & 0xff, tf); + fputc(f->len & 0xff, tf); + } + + unsigned char *p = line_nth(&in_line, f->start_pos); + for (int j = 0; j < f->len; j++) + fputc(*p++, tf); + + intarray_t *w = &out_format->column_widths; + while (i >= intarray_count(w)) + *intarray_push(w) = 0; + if (*intarray_nth(w, i) < f->len) + *intarray_nth(w, i) = f->len; + } + fputc(0xff, tf); +} + /*** Transforms ***/ static void trim_fields(void) @@ -354,6 +470,50 @@ static void select_fields(void) } } +/*** Processing of files ***/ + +static void one_pass(void) +{ + line_number = 0; + for (;;) { + line_number++; + fields_reset(&in_fields); + line_reset(&in_line); + in_field = NULL; + if (!in_format->read_line()) + break; + + if (want_trim) + trim_fields(); + + fields_reset(&out_fields); + select_fields(); + + out_format->write_line(); + } +} + +static void two_pass(void) +{ + struct format *final_format = out_format; + + // Pass 1: Set up writer of intermediate format + out_format = xmalloc_zero(sizeof(*out_format)); + out_format->id = FORM_TMP; + out_format->read_line = tmp_read; + out_format->write_line = tmp_write; + out_format->tmp_file = tmpfile(); + intarray_init(&out_format->column_widths); + one_pass(); + + // Pass 2: Set up reader of intermediate format + in_format = out_format; + rewind(in_format->tmp_file); + out_format = final_format; + one_pass(); + fclose(in_format->tmp_file); +} + /*** Parsing of arguments ***/ static void usage(void) @@ -367,11 +527,13 @@ Formats:\n\ -w, --ws Values separated by arbitrary whitespace\n\ -W, --strict-ws Like --ws, but recognize empty columns at start/end\n\ -r, --regex= Separator given by Perl regular expression (input only)\n\ + --table Format a table (output only)\n\ \n\ Format parameters:\n\ -d, --fs= Delimiter of fields\n\ -q, --quiet Do not show warnings\n\ --always-quote Put quotes around all fields (CSV output only)\n\ + --table-sep= Separate table columns by spaces (default: 2)\n\ \n\ Other options:\n\ --trim Trim leading and trailing whitespaces in fields\n\ @@ -399,6 +561,8 @@ enum long_options { OPT_HELP = 256, OPT_TRIM, OPT_ALWAYS_QUOTE, + OPT_TABLE, + OPT_TABLE_SEP, }; static const struct option long_options[] = { @@ -408,6 +572,8 @@ static const struct option long_options[] = { { "quiet", 0, NULL, 'q' }, { "regex", 1, NULL, 'r' }, { "strict-ws", 0, NULL, 'W' }, + { "table", 0, NULL, OPT_TABLE }, + { "table-sep", 1, NULL, OPT_TABLE_SEP }, { "trim", 0, NULL, OPT_TRIM }, { "tsv", 0, NULL, 't' }, { "ws", 0, NULL, 'w' }, @@ -417,8 +583,7 @@ static const struct option long_options[] = { static void set_format(int format_id) { - struct format *f = xmalloc(sizeof(*f)); - memset(f, 0, sizeof(*f)); + struct format *f = xmalloc_zero(sizeof(*f)); f->id = format_id; switch (format_id) { @@ -443,6 +608,11 @@ static void set_format(int format_id) case FORM_REGEX: f->read_line = regex_read; break; + case FORM_TABLE: + f->write_line = table_write; + f->needs_two_passes = 1; + f->table_sep = 2; + break; } if (!in_format) @@ -450,7 +620,7 @@ static void set_format(int format_id) else if (!out_format) out_format = f; else - bad_args("At most two format may be given."); + bad_args("At most two formats may be given."); } static struct format *current_format(void) @@ -466,7 +636,6 @@ static struct format *current_format(void) int main(int argc, char **argv) { int opt; - int want_trim = 0; const char *err; while ((opt = getopt_long(argc, argv, short_options, long_options, NULL)) >= 0) @@ -509,6 +678,12 @@ int main(int argc, char **argv) case OPT_TRIM: want_trim = 1; break; + case OPT_TABLE: + set_format(FORM_TABLE); + break; + case OPT_TABLE_SEP: + current_format()->table_sep = atoi(optarg); + break; default: bad_args(NULL); } @@ -532,22 +707,9 @@ int main(int argc, char **argv) fields_init(&out_fields); line_init(&in_line); - for (;;) { - line_number++; - fields_reset(&in_fields); - line_reset(&in_line); - in_field = NULL; - if (!in_format->read_line()) - break; - - if (want_trim) - trim_fields(); - - fields_reset(&out_fields); - select_fields(); - - out_format->write_line(); - } - + if (out_format->needs_two_passes) + two_pass(); + else + one_pass(); return 0; }