X-Git-Url: http://mj.ucw.cz/gitweb/?a=blobdiff_plain;f=xsv.c;h=b815c7205a774d09d88335496bde4bc7b38c584a;hb=aa30205e8933e981fab14a9ea284b3b7d9a2a12c;hp=4ad9e477714db146b918eb600e8f5ba4d9f10301;hpb=670598a4db2f60f84e14593bb48c3fe039328a67;p=xsv.git diff --git a/xsv.c b/xsv.c index 4ad9e47..b815c72 100644 --- a/xsv.c +++ b/xsv.c @@ -1,5 +1,5 @@ /* - * A Swiss-Army Knife for CSV-like Files + * The Swiss-Army Knife for CSV-like Files * * (c) 2012 Martin Mares */ @@ -9,28 +9,57 @@ #include #include #include +#include +#include #include +#ifdef __GNUC__ +#define NONRET __attribute__((noreturn)) +#define UNUSED __attribute__((unused)) +#else +#define NONRET +#define UNUSED +#endif + +static void select_fields(void); +static void select_all_fields(void); + +/*** General functions ***/ + +static void NONRET die(char *msg, ...) +{ + va_list args; + va_start(args, msg); + fprintf(stderr, "xsv: "); + vfprintf(stderr, msg, args); + fputc('\n', stderr); + va_end(args); + exit(1); +} + /*** Memory allocation ***/ static void *xmalloc(size_t bytes) { void *p = malloc(bytes); - if (!p) { - fprintf(stderr, "xsv: Out of memory (cannot allocate %zu bytes)\n", bytes); - exit(1); - } + if (!p) + die("Out of memory (cannot allocate %zu bytes)", bytes); + return p; +} + +static void *xmalloc_zero(size_t bytes) +{ + void *p = xmalloc(bytes); + memset(p, 0, bytes); return p; } static void *xrealloc(void *old, size_t bytes) { void *p = realloc(old, bytes); - if (!p) { - fprintf(stderr, "xsv: Out of memory (cannot allocate %zu bytes)\n", bytes); - exit(1); - } + if (!p) + die("Out of memory (cannot allocate %zu bytes)", bytes); return p; } @@ -51,6 +80,9 @@ static void *xrealloc(void *old, size_t bytes) static inline type *name##_nth(name##_t *b, int n) { return &b->start[n]; } \ // end +DECLARE_BUF(intarray, int); +DECLARE_BUF(stringarray, char *); + /*** Formats and their parameters ***/ enum format_id { @@ -59,6 +91,8 @@ enum format_id { FORM_CSV, FORM_WS, FORM_REGEX, + FORM_TMP, + FORM_TABLE, }; struct format { @@ -66,16 +100,34 @@ struct format { int fs; int quote; int quiet; - int (*read_line)(void); - void (*write_line)(void); + int sloppy; + int (*read_line)(struct format *fmt); + void (*write_line)(struct format *fmt); + void (*write_grid)(struct format *fmt, int pos); // -1=above, 1=below, 0=after header + int needs_stats; + + // Field names + int has_header; + char *set_field_names; + struct field_names *field_names; + // CSV backend: int always_quote; + // regex backend: pcre *pcre; pcre_extra *pcre_extra; + + // Temporary file backend: + FILE *tmp_file; + + // Table backend: + int table_sep; + int table_grid; }; static struct format *in_format, *out_format; +static int want_trim, want_equalize, want_stats; struct field { int start_pos; @@ -90,6 +142,34 @@ static struct field *in_field; static line_t in_line; static int line_number; +static int read_line(void) +{ + fields_reset(&in_fields); + line_reset(&in_line); + in_field = NULL; + if (!in_format->read_line(in_format)) + return 0; + if (ferror_unlocked(stdin)) + die("I/O error when reading standard input"); + return 1; +} + +static void write_line(void) +{ + out_format->write_line(out_format); + if (ferror_unlocked(stdout)) + die("I/O error when writing standard input"); +} + +static void write_grid(int pos) +{ + if (out_format->write_grid) { + out_format->write_grid(out_format, pos); + if (ferror_unlocked(stdout)) + die("I/O error when writing standard input"); + } +} + static void new_field(int pos) { in_field = fields_push(&in_fields); @@ -103,13 +183,20 @@ static void ensure_field(int pos) new_field(pos); } +static unsigned char *get_field(fields_t *fields, int i, int *len) +{ + struct field *f = fields_nth(fields, i); + *len = f->len; + return line_nth(&in_line, f->start_pos); +} + static void warn(struct format *fmt, char *msg, ...) { if (!fmt->quiet) { fprintf(stderr, "Warning at line %d: ", line_number); va_list args; va_start(args, msg); - vfprintf(stderr, args, msg); + vfprintf(stderr, msg, args); va_end(args); fputc('\n', stderr); } @@ -118,7 +205,7 @@ static void warn(struct format *fmt, char *msg, ...) static int next_line(void) { for (;;) { - int c = getchar(); + int c = getchar_unlocked(); if (c == '\r') continue; if (c < 0) @@ -129,37 +216,79 @@ static int next_line(void) } } -static int csv_read(void) +static int field_chars(struct field *f) +{ + unsigned char *s = line_nth(&in_line, f->start_pos); + int i = 0; + mbstate_t mbs; + memset(&mbs, 0, sizeof(mbs)); + + int chars = 0; + while (i < f->len) { + size_t k = mbrlen((char *) s + i, f->len - i, &mbs); + if ((int) k <= 0) + break; + i += k; + chars++; + } + + return chars; +} + +/*** Field statistics ***/ + +static intarray_t column_widths; + +static void update_stats(void) +{ + if (!want_stats) + return; + + for (int i = 0; i < fields_count(&out_fields); i++) { + struct field *f = fields_nth(&out_fields, i); + intarray_t *w = &column_widths; + + while (i >= intarray_count(w)) + *intarray_push(w) = 0; + int fw = field_chars(f); + if (*intarray_nth(w, i) < fw) + *intarray_nth(w, i) = fw; + } +} + +/*** CSV/TSV back-end */ + +static int csv_read(struct format *fmt) { int quoted = 0; for (;;) { - int c = getchar(); + int c = getchar_unlocked(); int i = line_count(&in_line); restart: if (c == '\r') continue; if (c < 0 || c == '\n') { if (quoted) - warn(in_format, "Missing closing quote."); + warn(fmt, "Missing closing quote."); if (c < 0) return !!fields_count(&in_fields); else return 1; } if (quoted) { - if (c == in_format->quote) { - c = getchar(); - if (c != in_format->quote) { + if (c == fmt->quote) { + c = getchar_unlocked(); + if (c != fmt->quote) { quoted = 0; goto restart; } // Two quotes assimilate to one } // Fall through to pushing the character - } else if (c == in_format->quote) { + } else if (c == fmt->quote) { quoted = 1; continue; - } else if (c == in_format->fs && !quoted) { + } else if (c == fmt->fs && !quoted) { ensure_field(i); new_field(i); continue; @@ -175,40 +304,41 @@ static int is_ws(int c) return (c == ' ' || c == '\t' || c == '\f'); } -static void csv_write(void) +static void csv_write(struct format *fmt) { - unsigned char *line = line_first(&in_line); - int n = fields_count(&out_fields); - for (int i=0; iquote >= 0) { - need_quotes = out_format->always_quote; - for (int j=0; !need_quotes && j < f->len; j++) { - int c = line[f->start_pos + j]; - if (c == out_format->fs || c == out_format->quote) + if (fmt->quote >= 0) { + need_quotes = fmt->always_quote; + for (int j=0; !need_quotes && j < len; j++) { + if (p[j] == fmt->fs || p[j] == fmt->quote) need_quotes = 1; } } if (i) - putchar(out_format->fs); + putchar_unlocked(fmt->fs); if (need_quotes) - putchar(out_format->quote); - for (int j=0; j < f->len; j++) { - int c = line[f->start_pos + j]; - if (c == out_format->fs && !need_quotes) - warn(out_format, "Field separator found inside field and quoting is turned off."); - if (c == out_format->quote) - putchar(c); - putchar(c); + putchar_unlocked(fmt->quote); + for (int j=0; j < len; j++) { + int c = p[j]; + if (c == fmt->fs && !need_quotes) + warn(fmt, "Field separator found inside field and quoting is turned off."); + if (c == fmt->quote) + putchar_unlocked(c); + putchar_unlocked(c); } if (need_quotes) - putchar(out_format->quote); + putchar_unlocked(fmt->quote); } - putchar('\n'); + putchar_unlocked('\n'); } -static int ws_read(void) +/*** White-space back-end ***/ + +static int ws_read(struct format *fmt) { if (!next_line()) return 0; @@ -226,18 +356,25 @@ static int ws_read(void) ws++; } else { if (ws) { - new_field(i); + if (!in_field->start_pos && + !in_field->len && + fmt->sloppy) + in_field->start_pos = i; + else + new_field(i); ws = 0; } in_field->len++; } } - if (ws) + if (ws && !fmt->sloppy) new_field(n); return 1; } +/*** Regex back-end ***/ + static const char *regex_set(struct format *f, char *rx) { const char *err; @@ -253,7 +390,7 @@ static const char *regex_set(struct format *f, char *rx) return NULL; } -static int regex_read(void) +static int regex_read(struct format *fmt) { if (!next_line()) return 0; @@ -266,21 +403,143 @@ static int regex_read(void) int i = 0; for (;;) { int ovec[3]; - int sep = pcre_exec(in_format->pcre, in_format->pcre_extra, (char *) c, n, i, 0, ovec, 3); - if (sep < 0) { - if (sep != PCRE_ERROR_NOMATCH) - warn(in_format, "PCRE matching error %d", sep); + int err = pcre_exec(fmt->pcre, fmt->pcre_extra, (char *) c, n, i, 0, ovec, 3); + if (err < 0) { + if (err != PCRE_ERROR_NOMATCH) + warn(fmt, "PCRE matching error %d", err); // No further occurrence of the separator: the rest is a single field + if (!fmt->sloppy || i < n) { + new_field(i); + in_field->len = n - i; + } + return 1; + } + if (ovec[0] == ovec[1]) { + warn(fmt, "Regular expression matched an empty separator."); new_field(i); in_field->len = n - i; return 1; } - new_field(i); - in_field->len = ovec[0] - i; + if (!fmt->sloppy || ovec[0]) { + new_field(i); + in_field->len = ovec[0] - i; + } i = ovec[1]; } } +/*** Table back-end ***/ + +static void table_write(struct format *fmt) +{ + for (int i = 0; i < intarray_count(&column_widths); i++) { + if (fmt->table_grid) { + putchar_unlocked('|'); + printf("%*s", fmt->table_sep / 2, ""); + } else if (i) + printf("%*s", fmt->table_sep, ""); + + int cw = *intarray_nth(&column_widths, i); + int fw = 0; + if (i < fields_count(&out_fields)) { + int len; + unsigned char *p = get_field(&out_fields, i, &len); + fw = field_chars(fields_nth(&out_fields, i)); + if (fw > cw) { + warn(fmt, "Internal error: Wrongly calculated width of column %d (%d > %d)", i, fw, cw); + cw = fw; + } + while (len--) + putchar(*p++); + } + while (fw < cw) { + putchar_unlocked(' '); + fw++; + } + + if (fmt->table_grid) + printf("%*s", fmt->table_sep - fmt->table_sep / 2, ""); + } + + if (fmt->table_grid) + putchar_unlocked('|'); + putchar_unlocked('\n'); +} + +static void table_write_grid(struct format *fmt, int pos UNUSED) +{ + if (!fmt->table_grid) + return; + + for (int i = 0; i < intarray_count(&column_widths); i++) { + putchar_unlocked('+'); + int w = fmt->table_sep + *intarray_nth(&column_widths, i); + while (w--) + putchar('-'); + } + putchar_unlocked('+'); + putchar_unlocked('\n'); +} + +/*** Temporary file back-end ***/ + +static int tmp_read(struct format *fmt) +{ + FILE *tf = fmt->tmp_file; + + for (;;) { + int c = getc_unlocked(tf); + if (c < 0) + return 0; + if (c == 0xff) + return 1; + if (c == 0xfe) { + c = getc_unlocked(tf); + c = (c << 8) | getc_unlocked(tf); + c = (c << 8) | getc_unlocked(tf); + c = (c << 8) | getc_unlocked(tf); + } + new_field(line_count(&in_line)); + in_field->len = c; + while (c--) { + int x = getc_unlocked(tf); + if (x < 0) + die("Truncated temporary file"); + *line_push(&in_line) = x; + } + } + + if (ferror_unlocked(tf)) + die("I/O error when reading temporary file"); +} + +static void tmp_write(struct format *fmt) +{ + FILE *tf = fmt->tmp_file; + + for (int i = 0; i < fields_count(&out_fields); i++) { + int len; + unsigned char *p = get_field(&out_fields, i, &len); + + if (len < 0xfe) + putc_unlocked(len, tf); + else { + putc_unlocked(0xfe, tf); + putc_unlocked((len >> 24) & 0xff, tf); + putc_unlocked((len >> 16) & 0xff, tf); + putc_unlocked((len >> 8) & 0xff, tf); + putc_unlocked(len & 0xff, tf); + } + + while (len--) + putc_unlocked(*p++, tf); + } + putc_unlocked(0xff, tf); + + if (ferror_unlocked(tf)) + die("I/O error when writing temporary file"); +} + /*** Transforms ***/ static void trim_fields(void) @@ -295,15 +554,166 @@ static void trim_fields(void) } } +static void equalize_fields(void) +{ + while (fields_count(&out_fields) < intarray_count(&column_widths)) { + struct field *f = fields_push(&out_fields); + f->start_pos = f->len = 0; + } +} + +/*** Field names and headers ***/ + +struct field_names { + stringarray_t names; +}; + +static void add_field(struct field_names *fn, char *name, int namelen) +{ + char *n = xmalloc(namelen + 1); + memcpy(n, name, namelen); + n[namelen] = 0; + *stringarray_push(&fn->names) = n; +} + +static void add_field_names(struct field_names *fn, char *names) +{ + char *p = names; + while (p) { + char *q = strchr(p, ','); + int len = q ? q-p : (int) strlen(p); + add_field(fn, p, len); + p = q ? q+1 : NULL; + } +} + +static void read_header(void) +{ + if (!(in_format->has_header || in_format->set_field_names)) + return; + + struct field_names *fn = xmalloc_zero(sizeof(*fn)); + in_format->field_names = fn; + + if (in_format->has_header) { + if (!read_line()) + die("Missing input header"); + } + + if (in_format->set_field_names) { + add_field_names(fn, in_format->set_field_names); + } else { + for (int i = 0; i < fields_count(&in_fields); i++) { + int len; + char *s = (char *) get_field(&in_fields, i, &len); + add_field(fn, s, len); + } + } +} + +static void write_header(void) +{ + if (!out_format->has_header) { + write_grid(-1); + return; + } + + int want_select_fields = 0; + if (out_format->set_field_names) { + struct field_names *fn = xmalloc_zero(sizeof(*fn)); + out_format->field_names = fn; + add_field_names(fn, out_format->set_field_names); + } else if (in_format->field_names) { + out_format->field_names = in_format->field_names; + want_select_fields = 1; + } else + die("Output header requested, but no field names specified"); + + line_reset(&in_line); + fields_reset(&in_fields); + struct field_names *fn = out_format->field_names; + for (int i = 0; i < stringarray_count(&fn->names); i++) { + struct field *f = fields_push(&in_fields); + f->start_pos = line_count(&in_line); + f->len = 0; + char *s = *stringarray_nth(&fn->names, i); + while (*s) { + *line_push(&in_line) = *s++; + f->len++; + } + } + + fields_reset(&out_fields); + if (want_select_fields) + select_fields(); + else + select_all_fields(); + + // This is tricky: when we are formatting a table, field names are normally + // calculated in pass 1, but the header is written in pass 2, so we have to + // update column statistics, because field name can be too wide to fit. + want_stats++; + update_stats(); + want_stats--; + if (want_equalize) + equalize_fields(); + write_grid(-1); + write_line(); + write_grid(0); +} + +static void write_footer(void) +{ + write_grid(1); +} + +static int find_field_by_name(struct field_names *fn, char *name) +{ + for (int i = 0; i < stringarray_count(&fn->names); i++) + if (!strcmp(*stringarray_nth(&fn->names, i), name)) + return i + 1; + return -1; +} + /*** Field selection ***/ struct selector { - int first_field, last_field; + int first_field, last_field; // 0 means "boundary" }; DECLARE_BUF(selectors, struct selector); static selectors_t selectors; +static int parse_field_num(char *str) +{ + int f = 0; + + while (*str) { + if (*str < '0' || *str > '9') + return -1; + if (f >= 100000000) + return -1; + f = 10*f + *str - '0'; + str++; + } + return f; +} + +static int parse_field(char *str) +{ + if (!*str) + return 0; + + int f = parse_field_num(str); + if (f > 0) + return f; + + if (in_format->field_names && (f = find_field_by_name(in_format->field_names, str)) > 0) + return f; + + die("Unknown field `%s'", str); +} + static char *parse_selector(char *str) { char buf[strlen(str) + 1]; @@ -313,10 +723,10 @@ static char *parse_selector(char *str) char *sep = strchr(buf, '-'); if (sep) { *sep++ = 0; - s->first_field = atoi(buf); - s->last_field = atoi(sep); + s->first_field = parse_field(buf); + s->last_field = parse_field(sep); } else - s->first_field = s->last_field = atoi(buf); + s->first_field = s->last_field = parse_field(buf); return NULL; } @@ -347,31 +757,101 @@ static void select_fields(void) } } +static void select_all_fields(void) +{ + for (int i = 0; i < fields_count(&in_fields); i++) + *fields_push(&out_fields) = *fields_nth(&in_fields, i); +} + +/*** Processing of files ***/ + +static void one_pass(int pass) +{ + if (pass & 2) + write_header(); + + for (;;) { + line_number++; + if (!read_line()) + break; + + if (want_trim && (pass & 1)) + trim_fields(); + + fields_reset(&out_fields); + if (pass & 1) + select_fields(); + else + select_all_fields(); + + if (want_equalize && (pass & 2)) + equalize_fields(); + update_stats(); + write_line(); + } + + if (pass & 2) + write_footer(); +} + +static void two_pass(void) +{ + struct format *final_format = out_format; + + // We need to use character set info from the current locale + setlocale(LC_CTYPE, ""); + + // Pass 1: Set up writer of intermediate format + out_format = xmalloc_zero(sizeof(*out_format)); + out_format->id = FORM_TMP; + out_format->read_line = tmp_read; + out_format->write_line = tmp_write; + out_format->tmp_file = tmpfile(); + out_format->field_names = in_format->field_names; + one_pass(1); + + // Pass 2: Set up reader of intermediate format + in_format = out_format; + rewind(in_format->tmp_file); + line_number = 0; + out_format = final_format; + want_stats = 0; + one_pass(2); + fclose(in_format->tmp_file); +} + /*** Parsing of arguments ***/ -static void usage(void) +static void NONRET usage(void) { printf("\ Usage: xsv [] []\n\ \n\ Formats:\n\ --t, --tsv TAB-separated values (default)\n\ +-t, --tsv Tab-separated values (default)\n\ -c, --csv Comma-separated values\n\ -w, --ws Values separated by arbitrary whitespace\n\ -r, --regex= Separator given by Perl regular expression (input only)\n\ + --table Format a table (output only)\n\ \n\ Format parameters:\n\ -d, --fs= Delimiter of fields\n\ +-f, --fields=,... Set field names\n\ +-h, --header The first line contains field names\n\ -q, --quiet Do not show warnings\n\ --always-quote Put quotes around all fields (CSV output only)\n\ + --table-sep= Separate table columns by spaces (default: 2)\n\ + --grid Separate table columns by grid lines\n\ +-s, --sloppy Ignore separators at the start/end of line (ws/regex only)\n\ \n\ Other options:\n\ --trim Trim leading and trailing whitespaces in fields\n\ + --equalize Pad all lines to the maximum number of fields\n\ "); exit(0); } -static void bad_args(const char *msg, ...) +static void NONRET bad_args(const char *msg, ...) { if (msg) { va_list args; @@ -385,31 +865,43 @@ static void bad_args(const char *msg, ...) exit(1); } -static const char short_options[] = "cd:qr:tw"; +static const char short_options[] = "cd:f:hqr:twW"; enum long_options { OPT_HELP = 256, + OPT_VERSION, OPT_TRIM, OPT_ALWAYS_QUOTE, + OPT_TABLE, + OPT_TABLE_SEP, + OPT_GRID, + OPT_EQUALIZE, }; static const struct option long_options[] = { { "always-quote", 0, NULL, OPT_ALWAYS_QUOTE }, { "csv", 0, NULL, 'c' }, + { "equalize", 0, NULL, OPT_EQUALIZE }, + { "fields", 1, NULL, 'f' }, { "fs", 1, NULL, 'd' }, + { "grid", 0, NULL, OPT_GRID }, + { "header", 0, NULL, 'h' }, + { "help", 0, NULL, OPT_HELP }, { "quiet", 0, NULL, 'q' }, { "regex", 1, NULL, 'r' }, + { "sloppy", 0, NULL, 's' }, + { "table", 0, NULL, OPT_TABLE }, + { "table-sep", 1, NULL, OPT_TABLE_SEP }, { "trim", 0, NULL, OPT_TRIM }, { "tsv", 0, NULL, 't' }, + { "version", 0, NULL, OPT_VERSION }, { "ws", 0, NULL, 'w' }, - { "help", 0, NULL, OPT_HELP }, { NULL, 0, NULL, 0 }, }; static void set_format(int format_id) { - struct format *f = xmalloc(sizeof(*f)); - memset(f, 0, sizeof(*f)); + struct format *f = xmalloc_zero(sizeof(*f)); f->id = format_id; switch (format_id) { @@ -434,6 +926,12 @@ static void set_format(int format_id) case FORM_REGEX: f->read_line = regex_read; break; + case FORM_TABLE: + f->write_line = table_write; + f->write_grid = table_write_grid; + f->needs_stats = 1; + f->table_sep = 2; + break; } if (!in_format) @@ -441,7 +939,7 @@ static void set_format(int format_id) else if (!out_format) out_format = f; else - bad_args("At most two format may be given."); + bad_args("At most two formats may be given."); } static struct format *current_format(void) @@ -457,7 +955,6 @@ static struct format *current_format(void) int main(int argc, char **argv) { int opt; - int want_trim = 0; const char *err; while ((opt = getopt_long(argc, argv, short_options, long_options, NULL)) >= 0) @@ -471,6 +968,12 @@ int main(int argc, char **argv) else bad_args("No field delimiter given."); break; + case 'f': + current_format()->set_field_names = optarg; + break; + case 'h': + current_format()->has_header = 1; + break; case 'q': current_format()->quiet = 1; break; @@ -480,6 +983,11 @@ int main(int argc, char **argv) if (err) bad_args("Error compiling regex: %s", err); break; + case 's': + if (current_format()->id != FORM_WS && current_format()->id != FORM_REGEX) + bad_args("--sloppy makes sense only for --ws or --regex."); + current_format()->sloppy = 1; + break; case 't': set_format(FORM_TSV); break; @@ -488,14 +996,29 @@ int main(int argc, char **argv) break; case OPT_ALWAYS_QUOTE: if (current_format()->id != FORM_CSV) - bad_args("--always-quote makes sense only for CSV."); + bad_args("--always-quote makes sense only for --csv."); current_format()->always_quote = 1; break; case OPT_HELP: usage(); + case OPT_VERSION: + puts("This is xsv version " VERSION "."); + exit(0); case OPT_TRIM: want_trim = 1; break; + case OPT_TABLE: + set_format(FORM_TABLE); + break; + case OPT_TABLE_SEP: + current_format()->table_sep = atoi(optarg); + break; + case OPT_GRID: + current_format()->table_grid = 1; + break; + case OPT_EQUALIZE: + want_equalize = 1; + break; default: bad_args(NULL); } @@ -507,6 +1030,7 @@ int main(int argc, char **argv) bad_args("Write-only format selected for input."); if (!out_format->write_line) bad_args("Read-only format selected for output."); + read_header(); for (int i = optind; i < argc; i++) { err = parse_selector(argv[i]); @@ -515,26 +1039,10 @@ int main(int argc, char **argv) } finish_parse_selectors(); - fields_init(&in_fields); - fields_init(&out_fields); - line_init(&in_line); - - for (;;) { - line_number++; - fields_reset(&in_fields); - line_reset(&in_line); - in_field = NULL; - if (!in_format->read_line()) - break; - - if (want_trim) - trim_fields(); - - fields_reset(&out_fields); - select_fields(); - - out_format->write_line(); - } - + want_stats = out_format->needs_stats | want_equalize; + if (want_stats) + two_pass(); + else + one_pass(3); return 0; }