From 111d107062382f160e8d3dcbef2b5c3c8160e1da Mon Sep 17 00:00:00 2001
From: Martin Mares
Date: Mon, 23 Jul 2012 23:15:06 +0200
Subject: [PATCH] Added --regex format and --always-quote for --csv

---
Review notes (this section is ignored when the patch is applied):

* Purpose: adds an input-only regex format (-r, --regex) that splits each
  line on matches of a PCRE pattern, adds --always-quote for CSV output,
  makes warn() and bad_args() printf-style, and makes main() refuse a
  format that has no reader/writer for the direction it was selected for.
* warn(): vfprintf() was called with the va_list and the format string
  swapped; the arguments are now (stderr, msg, args).
* regex_read(): pcre_exec() is now called with PCRE_NOTEMPTY. Without it,
  a separator pattern that can match the empty string never advances the
  scan offset, so the loop keeps appending fields forever.
* main(): the selector error is passed as a "%s" argument, because
  bad_args() now interprets its first argument as a format string.
* NOTE(review): this text was rebuilt from a copy in which line breaks,
  indentation and angle-bracketed spans were lost. The three context
  #include names, the <rx>/<char> placeholders in the help text, the start
  of the field loop in csv_write() and all tabs are presumed; check them
  against xsv.c before applying. The index line was not regenerated.

 xsv.c | 136 +++++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 120 insertions(+), 16 deletions(-)

diff --git a/xsv.c b/xsv.c
index c21a97f..7ec94da 100644
--- a/xsv.c
+++ b/xsv.c
@@ -7,8 +7,11 @@
 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>
+#include <stdarg.h>
 #include <getopt.h>
 
+#include <pcre.h>
+
 /*** Memory allocation ***/
 
 static void *xmalloc(size_t bytes)
@@ -44,6 +47,7 @@ static void *xrealloc(void *old, size_t bytes)
 		if (b->count >= b->max) name##_extend(b);		\
 		return &b->start[b->count++];				\
 	}								\
+	static inline type *name##_first(name##_t *b) { return b->start; }	\
 	static inline type *name##_nth(name##_t *b, int n) { return &b->start[n]; }	\
 	// end
 
@@ -54,6 +58,7 @@ enum format_id {
 	FORM_TSV,
 	FORM_CSV,
 	FORM_WS,
+	FORM_REGEX,
 };
 
 struct format {
@@ -63,6 +68,11 @@ struct format {
 	int quiet;
 	int (*read_line)(void);
 	void (*write_line)(void);
+	// CSV backend:
+	int always_quote;
+	// regex backend:
+	pcre *pcre;
+	pcre_extra *pcre_extra;
 };
 
 static struct format *in_format, *out_format;
@@ -93,10 +103,30 @@ static void ensure_field(void)
 		new_field();
 }
 
-static void warn(struct format *fmt, char *msg)
+static void warn(struct format *fmt, char *msg, ...)
+{
+	if (!fmt->quiet) {
+		fprintf(stderr, "Warning at line %d: ", line_number);
+		va_list args;
+		va_start(args, msg);
+		vfprintf(stderr, msg, args);
+		va_end(args);
+		fputc('\n', stderr);
+	}
+}
+
+static int next_line(void)
 {
-	if (!fmt->quiet)
-		fprintf(stderr, "Warning at line %d: %s\n", line_number, msg);
+	for (;;) {
+		int c = getchar();
+		if (c == '\r')
+			continue;
+		if (c < 0)
+			return !!line_count(&in_line);
+		if (c == '\n')
+			return 1;
+		*line_push(&in_line) = c;
+	}
 }
 
 static int csv_read(void)
@@ -146,18 +176,17 @@ static int is_ws(int c)
 
 static void csv_write(void)
 {
-	unsigned char *line = line_nth(&in_line, 0);
+	unsigned char *line = line_first(&in_line);
 	int n = fields_count(&out_fields);
 	for (int i=0; i<n; i++) {
 		struct field *f = fields_nth(&out_fields, i);
 		int need_quotes = 0;
 		if (out_format->quote >= 0) {
-			for (int j=0; j < f->len; j++) {
+			need_quotes = out_format->always_quote;
+			for (int j=0; !need_quotes && j < f->len; j++) {
 				int c = line[f->start_pos + j];
-				if (c == out_format->fs || c == out_format->quote) {
+				if (c == out_format->fs || c == out_format->quote)
 					need_quotes = 1;
-					break;
-				}
 			}
 		}
 		if (i)
@@ -203,11 +232,56 @@ static int ws_read(void)
 	}
 }
 
+static const char *regex_set(struct format *f, char *rx)
+{
+	const char *err;
+	int errpos;
+	f->pcre = pcre_compile(rx, PCRE_DOLLAR_ENDONLY, &err, &errpos, NULL);
+	if (!f->pcre)
+		return err;
+
+	f->pcre_extra = pcre_study(f->pcre, 0, &err);
+	if (!f->pcre_extra)
+		return err;
+
+	return NULL;
+}
+
+static int regex_read(void)
+{
+	if (!next_line())
+		return 0;
+
+	unsigned char *c = line_first(&in_line);
+	int n = line_count(&in_line);
+	if (!n)
+		return 1;
+
+	int i = 0;
+	for (;;) {
+		int ovec[3];
+		int sep = pcre_exec(in_format->pcre, in_format->pcre_extra, (char *) c, n, i, PCRE_NOTEMPTY, ovec, 3);
+		if (sep < 0) {
+			if (sep != PCRE_ERROR_NOMATCH)
+				warn(in_format, "PCRE matching error %d", sep);
+			// No further occurrence of the separator: the rest is a single field
+			new_field();
+			in_field->start_pos = i;
+			in_field->len = n - i;
+			return 1;
+		}
+		new_field();
+		in_field->start_pos = i;
+		in_field->len = ovec[0] - i;
+		i = ovec[1];
+	}
+}
+
 /*** Transforms ***/
 
 static void trim_fields(void)
 {
-	unsigned char *line = line_nth(&in_line, 0);
+	unsigned char *line = line_first(&in_line);
 	for (int i = 0; i < fields_count(&in_fields); i++) {
 		struct field *f = fields_nth(&in_fields, i);
 		while (f->len && is_ws(line[f->start_pos]))
@@ -280,10 +354,12 @@ Formats:\n\
 -t, --tsv		TAB-separated values (default)\n\
 -c, --csv		Comma-separated values\n\
 -w, --ws		Values separated by arbitrary whitespace\n\
+-r, --regex=<rx>	Separator given by Perl regular expression (input only)\n\
 \n\
 Format parameters:\n\
 -d, --fs=<char>		Delimiter of fields\n\
 -q, --quiet		Do not show warnings\n\
+    --always-quote	Put quotes around all fields (CSV output only)\n\
 \n\
 Other options:\n\
     --trim		Trim leading and trailing whitespaces in fields\n\
@@ -291,25 +367,34 @@ Other options:\n\
 	exit(0);
 }
 
-static void bad_args(char *msg)
+static void bad_args(const char *msg, ...)
 {
-	if (msg)
-		fprintf(stderr, "xsv: %s\n", msg);
+	if (msg) {
+		va_list args;
+		va_start(args, msg);
+		fprintf(stderr, "xsv: ");
+		vfprintf(stderr, msg, args);
+		fputc('\n', stderr);
+		va_end(args);
+	}
 	fprintf(stderr, "Try `xsv --help' for more information.\n");
 	exit(1);
 }
 
-static const char short_options[] = "cd:qtw";
+static const char short_options[] = "cd:qr:tw";
 
 enum long_options {
 	OPT_HELP = 256,
-	OPT_TRIM = 257,
+	OPT_TRIM,
+	OPT_ALWAYS_QUOTE,
 };
 
 static const struct option long_options[] = {
+	{ "always-quote",	0,	NULL,	OPT_ALWAYS_QUOTE },
 	{ "csv",		0,	NULL,	'c' },
 	{ "fs",			1,	NULL,	'd' },
 	{ "quiet",		0,	NULL,	'q' },
+	{ "regex",		1,	NULL,	'r' },
 	{ "trim",		0,	NULL,	OPT_TRIM },
 	{ "tsv",		0,	NULL,	't' },
 	{ "ws",			0,	NULL,	'w' },
@@ -342,6 +427,9 @@ static void set_format(int format_id)
 		f->read_line = ws_read;
 		f->write_line = csv_write;
 		break;
+	case FORM_REGEX:
+		f->read_line = regex_read;
+		break;
 	}
 
 	if (!in_format)
@@ -366,6 +454,7 @@ int main(int argc, char **argv)
 {
 	int opt;
 	int want_trim = 0;
+	const char *err;
 
 	while ((opt = getopt_long(argc, argv, short_options, long_options, NULL)) >= 0)
 		switch (opt) {
@@ -381,12 +470,23 @@ int main(int argc, char **argv)
 			case 'q':
 				current_format()->quiet = 1;
 				break;
+			case 'r':
+				set_format(FORM_REGEX);
+				err = regex_set(current_format(), optarg);
+				if (err)
+					bad_args("Error compiling regex: %s", err);
+				break;
 			case 't':
 				set_format(FORM_TSV);
 				break;
 			case 'w':
 				set_format(FORM_WS);
 				break;
+			case OPT_ALWAYS_QUOTE:
+				if (current_format()->id != FORM_CSV)
+					bad_args("--always-quote makes sense only for CSV.");
+				current_format()->always_quote = 1;
+				break;
 			case OPT_HELP:
 				usage();
 			case OPT_TRIM:
@@ -399,9 +499,13 @@ int main(int argc, char **argv)
 	current_format();
 	if (!out_format)
 		out_format = in_format;
+	if (!in_format->read_line)
+		bad_args("Write-only format selected for input.");
+	if (!out_format->write_line)
+		bad_args("Read-only format selected for output.");
 
 	for (int i = optind; i < argc; i++) {
-		char *err = parse_selector(argv[i]);
+		err = parse_selector(argv[i]);
 		if (err)
-			bad_args(err);
+			bad_args("%s", err);
 	}
-- 
2.39.2