X-Git-Url: http://mj.ucw.cz/gitweb/?a=blobdiff_plain;f=xsv.c;h=872090f02abdb73fa19675cd8e8902895a89fb3a;hb=4b57a845372de3b915f195c84311b30e13bfdfc3;hp=5bf4f01537a3d211d8575b3c3abd5b6609d42138;hpb=4df21758270aa34348eca27f096992d2a1511f48;p=xsv.git diff --git a/xsv.c b/xsv.c index 5bf4f01..872090f 100644 --- a/xsv.c +++ b/xsv.c @@ -4,23 +4,46 @@ * (c) 2012 Martin Mares */ +#define _GNU_SOURCE + #include #include #include #include #include +#include +#include #include +#ifdef __GNUC__ +#define NONRET __attribute__((noreturn)) +#define UNUSED __attribute__((unused)) +#else +#define NONRET +#define UNUSED +#endif + +/*** General functions ***/ + +static void NONRET die(char *msg, ...) +{ + va_list args; + va_start(args, msg); + fprintf(stderr, "xsv: "); + vfprintf(stderr, msg, args); + fputc('\n', stderr); + va_end(args); + exit(1); +} + /*** Memory allocation ***/ static void *xmalloc(size_t bytes) { void *p = malloc(bytes); - if (!p) { - fprintf(stderr, "xsv: Out of memory (cannot allocate %zu bytes)\n", bytes); - exit(1); - } + if (!p) + die("Out of memory (cannot allocate %zu bytes)", bytes); return p; } @@ -34,10 +57,8 @@ static void *xmalloc_zero(size_t bytes) static void *xrealloc(void *old, size_t bytes) { void *p = realloc(old, bytes); - if (!p) { - fprintf(stderr, "xsv: Out of memory (cannot allocate %zu bytes)\n", bytes); - exit(1); - } + if (!p) + die("Out of memory (cannot allocate %zu bytes)", bytes); return p; } @@ -59,6 +80,7 @@ static void *xrealloc(void *old, size_t bytes) // end DECLARE_BUF(intarray, int); +DECLARE_BUF(stringarray, char *); /*** Formats and their parameters ***/ @@ -77,9 +99,15 @@ struct format { int fs; int quote; int quiet; - int (*read_line)(void); - void (*write_line)(void); - int needs_two_passes; + int (*read_line)(struct format *fmt); + void (*write_line)(struct format *fmt); + void (*write_grid)(struct format *fmt, int pos); // -1=above, 1=below, 0=after header + int needs_stats; + + // Field names + int has_header; + char *set_field_names; + struct field_names *field_names; // CSV backend: int always_quote; @@ -93,10 +121,10 @@ struct format { // Temporary file backend: FILE *tmp_file; - intarray_t column_widths; // Table backend: int table_sep; + int table_grid; }; static struct format *in_format, *out_format; @@ -115,6 +143,34 @@ static struct field *in_field; static line_t in_line; static int line_number; +static int read_line(void) +{ + fields_reset(&in_fields); + line_reset(&in_line); + in_field = NULL; + if (!in_format->read_line(in_format)) + return 0; + if (ferror_unlocked(stdin)) + die("I/O error when reading standard input"); + return 1; +} + +static void write_line(void) +{ + out_format->write_line(out_format); + if (ferror_unlocked(stdout)) + die("I/O error when writing standard input"); +} + +static void write_grid(int pos) +{ + if (out_format->write_grid) { + out_format->write_grid(out_format, pos); + if (ferror_unlocked(stdout)) + die("I/O error when writing standard input"); + } +} + static void new_field(int pos) { in_field = fields_push(&in_fields); @@ -128,13 +184,21 @@ static void ensure_field(int pos) new_field(pos); } +// FIXME: Use elsewhere +static unsigned char *get_field(fields_t *fields, int i, int *len) +{ + struct field *f = fields_nth(fields, i); + *len = f->len; + return line_nth(&in_line, f->start_pos); +} + static void warn(struct format *fmt, char *msg, ...) { if (!fmt->quiet) { fprintf(stderr, "Warning at line %d: ", line_number); va_list args; va_start(args, msg); - vfprintf(stderr, args, msg); + vfprintf(stderr, msg, args); va_end(args); fputc('\n', stderr); } @@ -143,7 +207,7 @@ static void warn(struct format *fmt, char *msg, ...) static int next_line(void) { for (;;) { - int c = getchar(); + int c = getchar_unlocked(); if (c == '\r') continue; if (c < 0) @@ -154,39 +218,76 @@ static int next_line(void) } } +static int field_chars(struct field *f) +{ + unsigned char *s = line_nth(&in_line, f->start_pos); + int i = 0; + mbstate_t mbs; + memset(&mbs, 0, sizeof(mbs)); + + int chars = 0; + while (i < f->len) { + size_t k = mbrlen((char *) s + i, f->len - i, &mbs); + if ((int) k <= 0) + break; + i += k; + chars++; + } + + return chars; +} + +/*** Field statistics ***/ + +static intarray_t column_widths; + +static void update_stats(void) +{ + for (int i = 0; i < fields_count(&out_fields); i++) { + struct field *f = fields_nth(&out_fields, i); + intarray_t *w = &column_widths; + + while (i >= intarray_count(w)) + *intarray_push(w) = 0; + int fw = field_chars(f); + if (*intarray_nth(w, i) < fw) + *intarray_nth(w, i) = fw; + } +} + /*** CSV/TSV back-end */ -static int csv_read(void) +static int csv_read(struct format *fmt) { int quoted = 0; for (;;) { - int c = getchar(); + int c = getchar_unlocked(); int i = line_count(&in_line); restart: if (c == '\r') continue; if (c < 0 || c == '\n') { if (quoted) - warn(in_format, "Missing closing quote."); + warn(fmt, "Missing closing quote."); if (c < 0) return !!fields_count(&in_fields); else return 1; } if (quoted) { - if (c == in_format->quote) { - c = getchar(); - if (c != in_format->quote) { + if (c == fmt->quote) { + c = getchar_unlocked(); + if (c != fmt->quote) { quoted = 0; goto restart; } // Two quotes assimilate to one } // Fall through to pushing the character - } else if (c == in_format->quote) { + } else if (c == fmt->quote) { quoted = 1; continue; - } else if (c == in_format->fs && !quoted) { + } else if (c == fmt->fs && !quoted) { ensure_field(i); new_field(i); continue; @@ -202,42 +303,42 @@ static int is_ws(int c) return (c == ' ' || c == '\t' || c == '\f'); } -static void csv_write(void) +static void csv_write(struct format *fmt) { unsigned char *line = line_first(&in_line); int n = fields_count(&out_fields); for (int i=0; iquote >= 0) { - need_quotes = out_format->always_quote; + if (fmt->quote >= 0) { + need_quotes = fmt->always_quote; for (int j=0; !need_quotes && j < f->len; j++) { int c = line[f->start_pos + j]; - if (c == out_format->fs || c == out_format->quote) + if (c == fmt->fs || c == fmt->quote) need_quotes = 1; } } if (i) - putchar(out_format->fs); + putchar_unlocked(fmt->fs); if (need_quotes) - putchar(out_format->quote); + putchar_unlocked(fmt->quote); for (int j=0; j < f->len; j++) { int c = line[f->start_pos + j]; - if (c == out_format->fs && !need_quotes) - warn(out_format, "Field separator found inside field and quoting is turned off."); - if (c == out_format->quote) - putchar(c); - putchar(c); + if (c == fmt->fs && !need_quotes) + warn(fmt, "Field separator found inside field and quoting is turned off."); + if (c == fmt->quote) + putchar_unlocked(c); + putchar_unlocked(c); } if (need_quotes) - putchar(out_format->quote); + putchar_unlocked(fmt->quote); } - putchar('\n'); + putchar_unlocked('\n'); } /*** White-space back-end ***/ -static int ws_read(void) +static int ws_read(struct format *fmt) { if (!next_line()) return 0; @@ -257,7 +358,7 @@ static int ws_read(void) if (ws) { if (!in_field->start_pos && !in_field->len && - !in_format->strict_ws) + !fmt->strict_ws) in_field->start_pos = i; else new_field(i); @@ -267,7 +368,7 @@ static int ws_read(void) } } - if (ws && in_format->strict_ws) + if (ws && fmt->strict_ws) new_field(n); return 1; } @@ -289,7 +390,7 @@ static const char *regex_set(struct format *f, char *rx) return NULL; } -static int regex_read(void) +static int regex_read(struct format *fmt) { if (!next_line()) return 0; @@ -302,10 +403,10 @@ static int regex_read(void) int i = 0; for (;;) { int ovec[3]; - int sep = pcre_exec(in_format->pcre, in_format->pcre_extra, (char *) c, n, i, 0, ovec, 3); + int sep = pcre_exec(fmt->pcre, fmt->pcre_extra, (char *) c, n, i, 0, ovec, 3); if (sep < 0) { if (sep != PCRE_ERROR_NOMATCH) - warn(in_format, "PCRE matching error %d", sep); + warn(fmt, "PCRE matching error %d", sep); // No further occurrence of the separator: the rest is a single field new_field(i); in_field->len = n - i; @@ -319,89 +420,115 @@ static int regex_read(void) /*** Table back-end ***/ -static void table_write(void) +static void table_write(struct format *fmt) { - for (int i = 0; i < fields_count(&in_fields); i++) { - if (i) - printf("%*s", out_format->table_sep, ""); - struct field *f = fields_nth(&in_fields, i); - int w = *intarray_nth(&in_format->column_widths, i); - if (f->len > w) { - warn(out_format, "Internal error: Wrongly calculated column width (%d > %d)", f->len, w); - w = f->len; - } - int j = 0; - unsigned char *p = line_nth(&in_line, f->start_pos); - while (j < f->len) { - putchar(*p++); - j++; + for (int i = 0; i < intarray_count(&column_widths); i++) { + if (fmt->table_grid) { + putchar_unlocked('|'); + printf("%*s", fmt->table_sep / 2, ""); + } else if (i) + printf("%*s", fmt->table_sep, ""); + + int cw = *intarray_nth(&column_widths, i); + int fw = 0; + if (i < fields_count(&out_fields)) { + int len; + unsigned char *p = get_field(&out_fields, i, &len); + fw = field_chars(fields_nth(&out_fields, i)); + if (fw > cw) { + warn(fmt, "Internal error: Wrongly calculated column width (%d > %d)", fw, cw); + cw = fw; + } + while (len--) + putchar(*p++); } - while (j < w) { - putchar(' '); - j++; + while (fw < cw) { + putchar_unlocked(' '); + fw++; } + + if (fmt->table_grid) + printf("%*s", fmt->table_sep - fmt->table_sep / 2, ""); } - putchar('\n'); + + if (fmt->table_grid) + putchar_unlocked('|'); + putchar_unlocked('\n'); +} + +static void table_write_grid(struct format *fmt, int pos UNUSED) +{ + if (!fmt->table_grid) + return; + + for (int i = 0; i < intarray_count(&column_widths); i++) { + putchar_unlocked('+'); + int w = fmt->table_sep + *intarray_nth(&column_widths, i); // FIXME: Avoid the * + while (w--) + putchar('-'); + } + putchar_unlocked('+'); + putchar_unlocked('\n'); } /*** Temporary file back-end ***/ -static int tmp_read(void) +static int tmp_read(struct format *fmt) { - FILE *tf = in_format->tmp_file; + FILE *tf = fmt->tmp_file; for (;;) { - int c = fgetc(tf); + int c = getc_unlocked(tf); if (c < 0) return 0; if (c == 0xff) return 1; if (c == 0xfe) { - c = fgetc(tf); - c = (c << 8) | fgetc(tf); - c = (c << 8) | fgetc(tf); - c = (c << 8) | fgetc(tf); + c = getc_unlocked(tf); + c = (c << 8) | getc_unlocked(tf); + c = (c << 8) | getc_unlocked(tf); + c = (c << 8) | getc_unlocked(tf); } new_field(line_count(&in_line)); in_field->len = c; while (c--) { - int x = fgetc(tf); + int x = getc_unlocked(tf); if (x < 0) { - warn(in_format, "Truncated temporary file"); + warn(fmt, "Truncated temporary file"); return 0; } *line_push(&in_line) = x; } } + + if (ferror_unlocked(tf)) + die("I/O error when reading temporary file"); } -static void tmp_write(void) +static void tmp_write(struct format *fmt) { - FILE *tf = out_format->tmp_file; + FILE *tf = fmt->tmp_file; - for (int i = 0; i < fields_count(&in_fields); i++) { - struct field *f = fields_nth(&in_fields, i); + for (int i = 0; i < fields_count(&out_fields); i++) { + struct field *f = fields_nth(&out_fields, i); if (f->len < 0xfe) - fputc(f->len, tf); + putc_unlocked(f->len, tf); else { - fputc(0xfe, tf); - fputc((f->len >> 24) & 0xff, tf); - fputc((f->len >> 16) & 0xff, tf); - fputc((f->len >> 8) & 0xff, tf); - fputc(f->len & 0xff, tf); + putc_unlocked(0xfe, tf); + putc_unlocked((f->len >> 24) & 0xff, tf); + putc_unlocked((f->len >> 16) & 0xff, tf); + putc_unlocked((f->len >> 8) & 0xff, tf); + putc_unlocked(f->len & 0xff, tf); } unsigned char *p = line_nth(&in_line, f->start_pos); for (int j = 0; j < f->len; j++) - fputc(*p++, tf); - - intarray_t *w = &out_format->column_widths; - while (i >= intarray_count(w)) - *intarray_push(w) = 0; - if (*intarray_nth(w, i) < f->len) - *intarray_nth(w, i) = f->len; + putc_unlocked(*p++, tf); } - fputc(0xff, tf); + putc_unlocked(0xff, tf); + + if (ferror_unlocked(tf)) + die("I/O error when writing temporary file"); } /*** Transforms ***/ @@ -418,15 +545,146 @@ static void trim_fields(void) } } +/*** Field names and headers ***/ + +struct field_names { + stringarray_t names; +}; + +static void add_field(struct field_names *fn, char *name, int namelen) +{ + char *n = xmalloc(namelen + 1); + memcpy(n, name, namelen); + n[namelen] = 0; + *stringarray_push(&fn->names) = n; +} + +static void add_field_names(struct field_names *fn, char *names) +{ + char *p = names; + while (p) { + char *q = strchr(p, ','); + int len = q ? q-p : (int) strlen(p); + add_field(fn, p, len); + p = q ? q+1 : NULL; + } +} + +static void read_header(void) +{ + if (!(in_format->has_header || in_format->set_field_names)) + return; + + struct field_names *fn = xmalloc_zero(sizeof(*fn)); + in_format->field_names = fn; + + if (in_format->has_header) { + if (!read_line()) + die("Missing input header"); + } + + if (in_format->set_field_names) { + add_field_names(fn, in_format->set_field_names); + } else { + for (int i = 0; i < fields_count(&in_fields); i++) { + int len; + char *s = (char *) get_field(&in_fields, i, &len); + add_field(fn, s, len); + } + } +} + +static void write_header(void) +{ + if (!out_format->has_header) { + write_grid(-1); + return; + } + + if (out_format->set_field_names) { + struct field_names *fn = xmalloc_zero(sizeof(*fn)); + out_format->field_names = fn; + add_field_names(fn, out_format->set_field_names); + } else if (in_format->field_names) + out_format->field_names = in_format->field_names; + else + die("Output header requested, but no field names specified"); + + line_reset(&in_line); + fields_reset(&out_fields); + struct field_names *fn = out_format->field_names; + for (int i = 0; i < stringarray_count(&fn->names); i++) { + struct field *f = fields_push(&out_fields); + f->start_pos = line_count(&in_line); + f->len = 0; + char *s = *stringarray_nth(&fn->names, i); + while (*s) { + *line_push(&in_line) = *s++; + f->len++; + } + } + + // This is tricky: when we are formatting a table, field names are normally + // calculated in pass 1, but the header is written in pass 2, so we have to + // update column statistics, because field name can be too wide to fit. + update_stats(); + write_grid(-1); + write_line(); + write_grid(0); +} + +static void write_footer(void) +{ + write_grid(1); +} + +static int find_field_by_name(struct field_names *fn, char *name) +{ + for (int i = 0; i < stringarray_count(&fn->names); i++) + if (!strcmp(*stringarray_nth(&fn->names, i), name)) + return i + 1; + return -1; +} + /*** Field selection ***/ struct selector { - int first_field, last_field; + int first_field, last_field; // 0 means "boundary" }; DECLARE_BUF(selectors, struct selector); static selectors_t selectors; +static int parse_field_num(char *str) +{ + int f = 0; + + while (*str) { + if (*str < '0' || *str > '9') + return -1; + if (f >= 100000000) + return -1; + f = 10*f + *str - '0'; + str++; + } + return f; +} + +static int parse_field(char *str) +{ + if (!*str) + return 0; + + int f = parse_field_num(str); + if (f > 0) + return f; + + if (in_format->field_names && (f = find_field_by_name(in_format->field_names, str)) > 0) + return f; + + die("Unknown field %s", str); +} + static char *parse_selector(char *str) { char buf[strlen(str) + 1]; @@ -436,10 +694,10 @@ static char *parse_selector(char *str) char *sep = strchr(buf, '-'); if (sep) { *sep++ = 0; - s->first_field = atoi(buf); - s->last_field = atoi(sep); + s->first_field = parse_field(buf); + s->last_field = parse_field(sep); } else - s->first_field = s->last_field = atoi(buf); + s->first_field = s->last_field = parse_field(buf); return NULL; } @@ -470,53 +728,73 @@ static void select_fields(void) } } +static void select_all_fields(void) +{ + for (int i = 0; i < fields_count(&in_fields); i++) + *fields_push(&out_fields) = *fields_nth(&in_fields, i); +} + /*** Processing of files ***/ -static void one_pass(void) +static void one_pass(int pass) { - line_number = 0; + if (pass & 2) + write_header(); + for (;;) { line_number++; - fields_reset(&in_fields); - line_reset(&in_line); - in_field = NULL; - if (!in_format->read_line()) + if (!read_line()) break; - if (want_trim) + if (want_trim && (pass & 1)) trim_fields(); fields_reset(&out_fields); - select_fields(); + if (pass & 1) + select_fields(); + else + select_all_fields(); + + if (out_format->needs_stats) + update_stats(); - out_format->write_line(); + write_line(); } + + if (pass & 2) + write_footer(); } static void two_pass(void) { struct format *final_format = out_format; + // We need to use character set info from the current locale + setlocale(LC_CTYPE, ""); + // Pass 1: Set up writer of intermediate format out_format = xmalloc_zero(sizeof(*out_format)); out_format->id = FORM_TMP; out_format->read_line = tmp_read; out_format->write_line = tmp_write; out_format->tmp_file = tmpfile(); - intarray_init(&out_format->column_widths); - one_pass(); + out_format->needs_stats = final_format->needs_stats; + out_format->field_names = in_format->field_names; + one_pass(1); // Pass 2: Set up reader of intermediate format in_format = out_format; rewind(in_format->tmp_file); + line_number = 0; out_format = final_format; - one_pass(); + out_format->needs_stats = 0; + one_pass(2); fclose(in_format->tmp_file); } /*** Parsing of arguments ***/ -static void usage(void) +static void NONRET usage(void) { printf("\ Usage: xsv [] []\n\ @@ -531,9 +809,12 @@ Formats:\n\ \n\ Format parameters:\n\ -d, --fs= Delimiter of fields\n\ +-f, --fields=,... Set field names\n\ +-h, --header The first line contains field names\n\ -q, --quiet Do not show warnings\n\ --always-quote Put quotes around all fields (CSV output only)\n\ --table-sep= Separate table columns by spaces (default: 2)\n\ + --grid Separate table columns by grid lines\n\ \n\ Other options:\n\ --trim Trim leading and trailing whitespaces in fields\n\ @@ -541,7 +822,7 @@ Other options:\n\ exit(0); } -static void bad_args(const char *msg, ...) +static void NONRET bad_args(const char *msg, ...) { if (msg) { va_list args; @@ -555,7 +836,7 @@ static void bad_args(const char *msg, ...) exit(1); } -static const char short_options[] = "cd:qr:twW"; +static const char short_options[] = "cd:f:hqr:twW"; enum long_options { OPT_HELP = 256, @@ -563,12 +844,16 @@ enum long_options { OPT_ALWAYS_QUOTE, OPT_TABLE, OPT_TABLE_SEP, + OPT_GRID, }; static const struct option long_options[] = { { "always-quote", 0, NULL, OPT_ALWAYS_QUOTE }, { "csv", 0, NULL, 'c' }, + { "fields", 1, NULL, 'f' }, { "fs", 1, NULL, 'd' }, + { "grid", 0, NULL, OPT_GRID }, + { "header", 0, NULL, 'h' }, { "quiet", 0, NULL, 'q' }, { "regex", 1, NULL, 'r' }, { "strict-ws", 0, NULL, 'W' }, @@ -610,7 +895,8 @@ static void set_format(int format_id) break; case FORM_TABLE: f->write_line = table_write; - f->needs_two_passes = 1; + f->write_grid = table_write_grid; + f->needs_stats = 1; f->table_sep = 2; break; } @@ -649,6 +935,12 @@ int main(int argc, char **argv) else bad_args("No field delimiter given."); break; + case 'f': + current_format()->set_field_names = optarg; + break; + case 'h': + current_format()->has_header = 1; + break; case 'q': current_format()->quiet = 1; break; @@ -684,6 +976,9 @@ int main(int argc, char **argv) case OPT_TABLE_SEP: current_format()->table_sep = atoi(optarg); break; + case OPT_GRID: + current_format()->table_grid = 1; + break; default: bad_args(NULL); } @@ -695,6 +990,7 @@ int main(int argc, char **argv) bad_args("Write-only format selected for input."); if (!out_format->write_line) bad_args("Read-only format selected for output."); + read_header(); for (int i = optind; i < argc; i++) { err = parse_selector(argv[i]); @@ -703,13 +999,9 @@ int main(int argc, char **argv) } finish_parse_selectors(); - fields_init(&in_fields); - fields_init(&out_fields); - line_init(&in_line); - - if (out_format->needs_two_passes) + if (out_format->needs_stats) two_pass(); else - one_pass(); + one_pass(3); return 0; }