X-Git-Url: http://mj.ucw.cz/gitweb/?a=blobdiff_plain;f=xsv.c;h=0d2d972b055544ddab45ad06132ed09ea4e2b8c6;hb=b21be4364c6b246bb13f0e448d7969be8651e248;hp=b0e91246cbd2aa1eb0f4f18ede9330a160c88f87;hpb=e0be90cb432c569d2aac3bba0b9e9de460e39bf7;p=xsv.git diff --git a/xsv.c b/xsv.c index b0e9124..0d2d972 100644 --- a/xsv.c +++ b/xsv.c @@ -1,11 +1,9 @@ /* - * A Swiss-Army Knife for CSV-like Files + * The Swiss-Army Knife for CSV-like Files * * (c) 2012 Martin Mares */ -#define _GNU_SOURCE - #include #include #include @@ -24,6 +22,9 @@ #define UNUSED #endif +static void select_fields(void); +static void select_all_fields(void); + /*** General functions ***/ static void NONRET die(char *msg, ...) @@ -99,6 +100,7 @@ struct format { int fs; int quote; int quiet; + int sloppy; int (*read_line)(struct format *fmt); void (*write_line)(struct format *fmt); void (*write_grid)(struct format *fmt, int pos); // -1=above, 1=below, 0=after header @@ -112,9 +114,6 @@ struct format { // CSV backend: int always_quote; - // WS backend: - int strict_ws; - // regex backend: pcre *pcre; pcre_extra *pcre_extra; @@ -359,7 +358,7 @@ static int ws_read(struct format *fmt) if (ws) { if (!in_field->start_pos && !in_field->len && - !fmt->strict_ws) + fmt->sloppy) in_field->start_pos = i; else new_field(i); @@ -369,7 +368,7 @@ static int ws_read(struct format *fmt) } } - if (ws && fmt->strict_ws) + if (ws && !fmt->sloppy) new_field(n); return 1; } @@ -404,17 +403,27 @@ static int regex_read(struct format *fmt) int i = 0; for (;;) { int ovec[3]; - int sep = pcre_exec(fmt->pcre, fmt->pcre_extra, (char *) c, n, i, 0, ovec, 3); - if (sep < 0) { - if (sep != PCRE_ERROR_NOMATCH) - warn(fmt, "PCRE matching error %d", sep); + int err = pcre_exec(fmt->pcre, fmt->pcre_extra, (char *) c, n, i, 0, ovec, 3); + if (err < 0) { + if (err != PCRE_ERROR_NOMATCH) + warn(fmt, "PCRE matching error %d", err); // No further occurrence of the separator: the rest is a single field + if (!fmt->sloppy || i < n) { + new_field(i); + in_field->len = n - i; + } + return 1; + } + if (ovec[0] == ovec[1]) { + warn(fmt, "Regular expression matched an empty separator."); new_field(i); in_field->len = n - i; return 1; } - new_field(i); - in_field->len = ovec[0] - i; + if (!fmt->sloppy || ovec[0]) { + new_field(i); + in_field->len = ovec[0] - i; + } i = ovec[1]; } } @@ -441,7 +450,7 @@ static void table_write(struct format *fmt) cw = fw; } while (len--) - putchar(*p++); + putchar_unlocked(*p++); } while (fw < cw) { putchar_unlocked(' '); @@ -466,7 +475,7 @@ static void table_write_grid(struct format *fmt, int pos UNUSED) putchar_unlocked('+'); int w = fmt->table_sep + *intarray_nth(&column_widths, i); while (w--) - putchar('-'); + putchar_unlocked('-'); } putchar_unlocked('+'); putchar_unlocked('\n'); @@ -494,10 +503,8 @@ static int tmp_read(struct format *fmt) in_field->len = c; while (c--) { int x = getc_unlocked(tf); - if (x < 0) { - warn(fmt, "Truncated temporary file"); - return 0; - } + if (x < 0) + die("Truncated temporary file"); *line_push(&in_line) = x; } } @@ -611,20 +618,22 @@ static void write_header(void) return; } + int want_select_fields = 0; if (out_format->set_field_names) { struct field_names *fn = xmalloc_zero(sizeof(*fn)); out_format->field_names = fn; add_field_names(fn, out_format->set_field_names); - } else if (in_format->field_names) + } else if (in_format->field_names) { out_format->field_names = in_format->field_names; - else + want_select_fields = 1; + } else die("Output header requested, but no field names specified"); line_reset(&in_line); - fields_reset(&out_fields); + fields_reset(&in_fields); struct field_names *fn = out_format->field_names; for (int i = 0; i < stringarray_count(&fn->names); i++) { - struct field *f = fields_push(&out_fields); + struct field *f = fields_push(&in_fields); f->start_pos = line_count(&in_line); f->len = 0; char *s = *stringarray_nth(&fn->names, i); @@ -634,6 +643,12 @@ static void write_header(void) } } + fields_reset(&out_fields); + if (want_select_fields) + select_fields(); + else + select_all_fields(); + // This is tricky: when we are formatting a table, field names are normally // calculated in pass 1, but the header is written in pass 2, so we have to // update column statistics, because field name can be too wide to fit. @@ -696,7 +711,7 @@ static int parse_field(char *str) if (in_format->field_names && (f = find_field_by_name(in_format->field_names, str)) > 0) return f; - die("Unknown field %s", str); + die("Unknown field `%s'", str); } static char *parse_selector(char *str) @@ -813,10 +828,9 @@ static void NONRET usage(void) Usage: xsv [] []\n\ \n\ Formats:\n\ --t, --tsv TAB-separated values (default)\n\ +-t, --tsv Tab-separated values (default)\n\ -c, --csv Comma-separated values\n\ -w, --ws Values separated by arbitrary whitespace\n\ --W, --strict-ws Like --ws, but recognize empty columns at start/end\n\ -r, --regex= Separator given by Perl regular expression (input only)\n\ --table Format a table (output only)\n\ \n\ @@ -828,6 +842,7 @@ Format parameters:\n\ --always-quote Put quotes around all fields (CSV output only)\n\ --table-sep= Separate table columns by spaces (default: 2)\n\ --grid Separate table columns by grid lines\n\ +-s, --sloppy Ignore separators at the start/end of line (ws/regex only)\n\ \n\ Other options:\n\ --trim Trim leading and trailing whitespaces in fields\n\ @@ -854,6 +869,7 @@ static const char short_options[] = "cd:f:hqr:twW"; enum long_options { OPT_HELP = 256, + OPT_VERSION, OPT_TRIM, OPT_ALWAYS_QUOTE, OPT_TABLE, @@ -870,15 +886,16 @@ static const struct option long_options[] = { { "fs", 1, NULL, 'd' }, { "grid", 0, NULL, OPT_GRID }, { "header", 0, NULL, 'h' }, + { "help", 0, NULL, OPT_HELP }, { "quiet", 0, NULL, 'q' }, { "regex", 1, NULL, 'r' }, - { "strict-ws", 0, NULL, 'W' }, + { "sloppy", 0, NULL, 's' }, { "table", 0, NULL, OPT_TABLE }, { "table-sep", 1, NULL, OPT_TABLE_SEP }, { "trim", 0, NULL, OPT_TRIM }, { "tsv", 0, NULL, 't' }, + { "version", 0, NULL, OPT_VERSION }, { "ws", 0, NULL, 'w' }, - { "help", 0, NULL, OPT_HELP }, { NULL, 0, NULL, 0 }, }; @@ -966,23 +983,27 @@ int main(int argc, char **argv) if (err) bad_args("Error compiling regex: %s", err); break; + case 's': + if (current_format()->id != FORM_WS && current_format()->id != FORM_REGEX) + bad_args("--sloppy makes sense only for --ws or --regex."); + current_format()->sloppy = 1; + break; case 't': set_format(FORM_TSV); break; case 'w': set_format(FORM_WS); break; - case 'W': - set_format(FORM_WS); - current_format()->strict_ws = 1; - break; case OPT_ALWAYS_QUOTE: if (current_format()->id != FORM_CSV) - bad_args("--always-quote makes sense only for CSV."); + bad_args("--always-quote makes sense only for --csv."); current_format()->always_quote = 1; break; case OPT_HELP: usage(); + case OPT_VERSION: + puts("This is xsv version " VERSION "."); + exit(0); case OPT_TRIM: want_trim = 1; break;