X-Git-Url: http://mj.ucw.cz/gitweb/?a=blobdiff_plain;f=xsv.c;h=0d2d972b055544ddab45ad06132ed09ea4e2b8c6;hb=b21be4364c6b246bb13f0e448d7969be8651e248;hp=872090f02abdb73fa19675cd8e8902895a89fb3a;hpb=4b57a845372de3b915f195c84311b30e13bfdfc3;p=xsv.git diff --git a/xsv.c b/xsv.c index 872090f..0d2d972 100644 --- a/xsv.c +++ b/xsv.c @@ -1,11 +1,9 @@ /* - * A Swiss-Army Knife for CSV-like Files + * The Swiss-Army Knife for CSV-like Files * * (c) 2012 Martin Mares */ -#define _GNU_SOURCE - #include #include #include @@ -24,6 +22,9 @@ #define UNUSED #endif +static void select_fields(void); +static void select_all_fields(void); + /*** General functions ***/ static void NONRET die(char *msg, ...) @@ -99,6 +100,7 @@ struct format { int fs; int quote; int quiet; + int sloppy; int (*read_line)(struct format *fmt); void (*write_line)(struct format *fmt); void (*write_grid)(struct format *fmt, int pos); // -1=above, 1=below, 0=after header @@ -112,9 +114,6 @@ struct format { // CSV backend: int always_quote; - // WS backend: - int strict_ws; - // regex backend: pcre *pcre; pcre_extra *pcre_extra; @@ -128,7 +127,7 @@ struct format { }; static struct format *in_format, *out_format; -static int want_trim; +static int want_trim, want_equalize, want_stats; struct field { int start_pos; @@ -184,7 +183,6 @@ static void ensure_field(int pos) new_field(pos); } -// FIXME: Use elsewhere static unsigned char *get_field(fields_t *fields, int i, int *len) { struct field *f = fields_nth(fields, i); @@ -243,6 +241,9 @@ static intarray_t column_widths; static void update_stats(void) { + if (!want_stats) + return; + for (int i = 0; i < fields_count(&out_fields); i++) { struct field *f = fields_nth(&out_fields, i); intarray_t *w = &column_widths; @@ -305,16 +306,15 @@ static int is_ws(int c) static void csv_write(struct format *fmt) { - unsigned char *line = line_first(&in_line); - int n = fields_count(&out_fields); - for (int i=0; iquote >= 0) { need_quotes = fmt->always_quote; - for (int j=0; !need_quotes && j < f->len; j++) { - int c = line[f->start_pos + j]; - if (c == fmt->fs || c == fmt->quote) + for (int j=0; !need_quotes && j < len; j++) { + if (p[j] == fmt->fs || p[j] == fmt->quote) need_quotes = 1; } } @@ -322,8 +322,8 @@ static void csv_write(struct format *fmt) putchar_unlocked(fmt->fs); if (need_quotes) putchar_unlocked(fmt->quote); - for (int j=0; j < f->len; j++) { - int c = line[f->start_pos + j]; + for (int j=0; j < len; j++) { + int c = p[j]; if (c == fmt->fs && !need_quotes) warn(fmt, "Field separator found inside field and quoting is turned off."); if (c == fmt->quote) @@ -358,7 +358,7 @@ static int ws_read(struct format *fmt) if (ws) { if (!in_field->start_pos && !in_field->len && - !fmt->strict_ws) + fmt->sloppy) in_field->start_pos = i; else new_field(i); @@ -368,7 +368,7 @@ static int ws_read(struct format *fmt) } } - if (ws && fmt->strict_ws) + if (ws && !fmt->sloppy) new_field(n); return 1; } @@ -403,17 +403,27 @@ static int regex_read(struct format *fmt) int i = 0; for (;;) { int ovec[3]; - int sep = pcre_exec(fmt->pcre, fmt->pcre_extra, (char *) c, n, i, 0, ovec, 3); - if (sep < 0) { - if (sep != PCRE_ERROR_NOMATCH) - warn(fmt, "PCRE matching error %d", sep); + int err = pcre_exec(fmt->pcre, fmt->pcre_extra, (char *) c, n, i, 0, ovec, 3); + if (err < 0) { + if (err != PCRE_ERROR_NOMATCH) + warn(fmt, "PCRE matching error %d", err); // No further occurrence of the separator: the rest is a single field + if (!fmt->sloppy || i < n) { + new_field(i); + in_field->len = n - i; + } + return 1; + } + if (ovec[0] == ovec[1]) { + warn(fmt, "Regular expression matched an empty separator."); new_field(i); in_field->len = n - i; return 1; } - new_field(i); - in_field->len = ovec[0] - i; + if (!fmt->sloppy || ovec[0]) { + new_field(i); + in_field->len = ovec[0] - i; + } i = ovec[1]; } } @@ -436,11 +446,11 @@ static void table_write(struct format *fmt) unsigned char *p = get_field(&out_fields, i, &len); fw = field_chars(fields_nth(&out_fields, i)); if (fw > cw) { - warn(fmt, "Internal error: Wrongly calculated column width (%d > %d)", fw, cw); + warn(fmt, "Internal error: Wrongly calculated width of column %d (%d > %d)", i, fw, cw); cw = fw; } while (len--) - putchar(*p++); + putchar_unlocked(*p++); } while (fw < cw) { putchar_unlocked(' '); @@ -463,9 +473,9 @@ static void table_write_grid(struct format *fmt, int pos UNUSED) for (int i = 0; i < intarray_count(&column_widths); i++) { putchar_unlocked('+'); - int w = fmt->table_sep + *intarray_nth(&column_widths, i); // FIXME: Avoid the * + int w = fmt->table_sep + *intarray_nth(&column_widths, i); while (w--) - putchar('-'); + putchar_unlocked('-'); } putchar_unlocked('+'); putchar_unlocked('\n'); @@ -493,10 +503,8 @@ static int tmp_read(struct format *fmt) in_field->len = c; while (c--) { int x = getc_unlocked(tf); - if (x < 0) { - warn(fmt, "Truncated temporary file"); - return 0; - } + if (x < 0) + die("Truncated temporary file"); *line_push(&in_line) = x; } } @@ -510,19 +518,20 @@ static void tmp_write(struct format *fmt) FILE *tf = fmt->tmp_file; for (int i = 0; i < fields_count(&out_fields); i++) { - struct field *f = fields_nth(&out_fields, i); - if (f->len < 0xfe) - putc_unlocked(f->len, tf); + int len; + unsigned char *p = get_field(&out_fields, i, &len); + + if (len < 0xfe) + putc_unlocked(len, tf); else { putc_unlocked(0xfe, tf); - putc_unlocked((f->len >> 24) & 0xff, tf); - putc_unlocked((f->len >> 16) & 0xff, tf); - putc_unlocked((f->len >> 8) & 0xff, tf); - putc_unlocked(f->len & 0xff, tf); + putc_unlocked((len >> 24) & 0xff, tf); + putc_unlocked((len >> 16) & 0xff, tf); + putc_unlocked((len >> 8) & 0xff, tf); + putc_unlocked(len & 0xff, tf); } - unsigned char *p = line_nth(&in_line, f->start_pos); - for (int j = 0; j < f->len; j++) + while (len--) putc_unlocked(*p++, tf); } putc_unlocked(0xff, tf); @@ -545,6 +554,14 @@ static void trim_fields(void) } } +static void equalize_fields(void) +{ + while (fields_count(&out_fields) < intarray_count(&column_widths)) { + struct field *f = fields_push(&out_fields); + f->start_pos = f->len = 0; + } +} + /*** Field names and headers ***/ struct field_names { @@ -601,20 +618,22 @@ static void write_header(void) return; } + int want_select_fields = 0; if (out_format->set_field_names) { struct field_names *fn = xmalloc_zero(sizeof(*fn)); out_format->field_names = fn; add_field_names(fn, out_format->set_field_names); - } else if (in_format->field_names) + } else if (in_format->field_names) { out_format->field_names = in_format->field_names; - else + want_select_fields = 1; + } else die("Output header requested, but no field names specified"); line_reset(&in_line); - fields_reset(&out_fields); + fields_reset(&in_fields); struct field_names *fn = out_format->field_names; for (int i = 0; i < stringarray_count(&fn->names); i++) { - struct field *f = fields_push(&out_fields); + struct field *f = fields_push(&in_fields); f->start_pos = line_count(&in_line); f->len = 0; char *s = *stringarray_nth(&fn->names, i); @@ -624,10 +643,20 @@ static void write_header(void) } } + fields_reset(&out_fields); + if (want_select_fields) + select_fields(); + else + select_all_fields(); + // This is tricky: when we are formatting a table, field names are normally // calculated in pass 1, but the header is written in pass 2, so we have to // update column statistics, because field name can be too wide to fit. + want_stats++; update_stats(); + want_stats--; + if (want_equalize) + equalize_fields(); write_grid(-1); write_line(); write_grid(0); @@ -682,7 +711,7 @@ static int parse_field(char *str) if (in_format->field_names && (f = find_field_by_name(in_format->field_names, str)) > 0) return f; - die("Unknown field %s", str); + die("Unknown field `%s'", str); } static char *parse_selector(char *str) @@ -755,9 +784,9 @@ static void one_pass(int pass) else select_all_fields(); - if (out_format->needs_stats) - update_stats(); - + if (want_equalize && (pass & 2)) + equalize_fields(); + update_stats(); write_line(); } @@ -778,7 +807,6 @@ static void two_pass(void) out_format->read_line = tmp_read; out_format->write_line = tmp_write; out_format->tmp_file = tmpfile(); - out_format->needs_stats = final_format->needs_stats; out_format->field_names = in_format->field_names; one_pass(1); @@ -787,7 +815,7 @@ static void two_pass(void) rewind(in_format->tmp_file); line_number = 0; out_format = final_format; - out_format->needs_stats = 0; + want_stats = 0; one_pass(2); fclose(in_format->tmp_file); } @@ -800,10 +828,9 @@ static void NONRET usage(void) Usage: xsv [] []\n\ \n\ Formats:\n\ --t, --tsv TAB-separated values (default)\n\ +-t, --tsv Tab-separated values (default)\n\ -c, --csv Comma-separated values\n\ -w, --ws Values separated by arbitrary whitespace\n\ --W, --strict-ws Like --ws, but recognize empty columns at start/end\n\ -r, --regex= Separator given by Perl regular expression (input only)\n\ --table Format a table (output only)\n\ \n\ @@ -815,9 +842,11 @@ Format parameters:\n\ --always-quote Put quotes around all fields (CSV output only)\n\ --table-sep= Separate table columns by spaces (default: 2)\n\ --grid Separate table columns by grid lines\n\ +-s, --sloppy Ignore separators at the start/end of line (ws/regex only)\n\ \n\ Other options:\n\ --trim Trim leading and trailing whitespaces in fields\n\ + --equalize Pad all lines to the maximum number of fields\n\ "); exit(0); } @@ -840,29 +869,33 @@ static const char short_options[] = "cd:f:hqr:twW"; enum long_options { OPT_HELP = 256, + OPT_VERSION, OPT_TRIM, OPT_ALWAYS_QUOTE, OPT_TABLE, OPT_TABLE_SEP, OPT_GRID, + OPT_EQUALIZE, }; static const struct option long_options[] = { { "always-quote", 0, NULL, OPT_ALWAYS_QUOTE }, { "csv", 0, NULL, 'c' }, + { "equalize", 0, NULL, OPT_EQUALIZE }, { "fields", 1, NULL, 'f' }, { "fs", 1, NULL, 'd' }, { "grid", 0, NULL, OPT_GRID }, { "header", 0, NULL, 'h' }, + { "help", 0, NULL, OPT_HELP }, { "quiet", 0, NULL, 'q' }, { "regex", 1, NULL, 'r' }, - { "strict-ws", 0, NULL, 'W' }, + { "sloppy", 0, NULL, 's' }, { "table", 0, NULL, OPT_TABLE }, { "table-sep", 1, NULL, OPT_TABLE_SEP }, { "trim", 0, NULL, OPT_TRIM }, { "tsv", 0, NULL, 't' }, + { "version", 0, NULL, OPT_VERSION }, { "ws", 0, NULL, 'w' }, - { "help", 0, NULL, OPT_HELP }, { NULL, 0, NULL, 0 }, }; @@ -950,23 +983,27 @@ int main(int argc, char **argv) if (err) bad_args("Error compiling regex: %s", err); break; + case 's': + if (current_format()->id != FORM_WS && current_format()->id != FORM_REGEX) + bad_args("--sloppy makes sense only for --ws or --regex."); + current_format()->sloppy = 1; + break; case 't': set_format(FORM_TSV); break; case 'w': set_format(FORM_WS); break; - case 'W': - set_format(FORM_WS); - current_format()->strict_ws = 1; - break; case OPT_ALWAYS_QUOTE: if (current_format()->id != FORM_CSV) - bad_args("--always-quote makes sense only for CSV."); + bad_args("--always-quote makes sense only for --csv."); current_format()->always_quote = 1; break; case OPT_HELP: usage(); + case OPT_VERSION: + puts("This is xsv version " VERSION "."); + exit(0); case OPT_TRIM: want_trim = 1; break; @@ -979,6 +1016,9 @@ int main(int argc, char **argv) case OPT_GRID: current_format()->table_grid = 1; break; + case OPT_EQUALIZE: + want_equalize = 1; + break; default: bad_args(NULL); } @@ -999,7 +1039,8 @@ int main(int argc, char **argv) } finish_parse_selectors(); - if (out_format->needs_stats) + want_stats = out_format->needs_stats | want_equalize; + if (want_stats) two_pass(); else one_pass(3);