/*
 *	A Swiss-Army Knife for CSV-like Files
 *
 *	(c) 2012 Martin Mares
 *
 *	Source of xsv.c as added by commit
 *	87fcc9b218c11a1b6115ebe9a534e9af4e4c3907 of xsv.git
 *	("First version: --csv and --tsv works", 2012-07-23),
 *	with the fixes described in the comments below.
 */

#include <getopt.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*** Memory allocation ***/

/* Allocate `bytes` bytes; print a diagnostic and exit(1) on failure. */
static void *xmalloc(size_t bytes)
{
	void *p = malloc(bytes);
	if (!p) {
		fprintf(stderr, "xsv: Out of memory (cannot allocate %zu bytes)\n", bytes);
		exit(1);
	}
	return p;
}

/* Resize `old` to `bytes` bytes; print a diagnostic and exit(1) on failure. */
static void *xrealloc(void *old, size_t bytes)
{
	void *p = realloc(old, bytes);
	if (!p) {
		fprintf(stderr, "xsv: Out of memory (cannot allocate %zu bytes)\n", bytes);
		exit(1);
	}
	return p;
}

/*
 * DECLARE_BUF(name, type) generates a growable array of `type`:
 *
 *	name_t			the buffer (start pointer, used count, capacity)
 *	name_init(b)		make the buffer empty with no storage
 *	name_reset(b)		forget all items, keep the storage
 *	name_count(b)		number of items currently stored
 *	name_push(b)		append one uninitialized item, return its address
 *	name_nth(b, n)		address of the n-th item (no bounds check)
 *
 * The capacity starts at 16 items and doubles whenever it is exhausted.
 * Pointers returned by push/nth are invalidated by the next push, because
 * the storage may be moved by realloc.
 */
#define DECLARE_BUF(name, type) \
	typedef struct { type *start; int count; int max; } name##_t; \
	static inline void name##_init(name##_t *b) { b->start = NULL; b->count = b->max = 0; } \
	static inline void name##_reset(name##_t *b) { b->count = 0; } \
	static inline int name##_count(name##_t *b) { return b->count; } \
	static void name##_extend(name##_t *b) { \
		if (b->max > INT_MAX / 2) { \
			fprintf(stderr, "xsv: Buffer too large\n"); \
			exit(1); \
		} \
		b->max = b->max ? 2*b->max : 16; \
		b->start = xrealloc(b->start, (size_t) b->max * sizeof(type)); \
	} \
	static inline type *name##_push(name##_t *b) { \
		if (b->count >= b->max) name##_extend(b); \
		return &b->start[b->count++]; \
	} \
	static inline type *name##_nth(name##_t *b, int n) { return &b->start[n]; } \
	// end

/*** Formats and their parameters ***/

enum format_id {
	FORM_UNSPEC,
	FORM_TSV,
	FORM_CSV,
	FORM_WS,
};

struct format {
	enum format_id id;
	int fs;				/* Field separator character */
	int quote;			/* Quote character, or -1 if the format has no quoting */
	int (*read_line)(void);		/* Reads one line into in_fields/in_line; returns 0 at end of input */
	void (*write_line)(void);	/* Writes out_fields as one line */
};

static struct format *in_format, *out_format;

/* A field is a slice of the current line buffer. */
struct field {
	int start_pos;
	int len;
};

DECLARE_BUF(fields, struct field);
DECLARE_BUF(line, unsigned char);

static fields_t in_fields, out_fields;
static struct field *in_field;		/* Field currently being read, NULL before the first one */
static line_t in_line;			/* Unquoted contents of all fields of the current line */

/* Start a new, empty input field at the current end of the line buffer. */
static void new_field(void)
{
	in_field = fields_push(&in_fields);
	in_field->start_pos = line_count(&in_line);
	in_field->len = 0;
}

/* Make sure there is a current input field, creating the first one if needed. */
static void ensure_field(void)
{
	if (!in_field)
		new_field();
}

/*
 * Read one line of separator-delimited input from stdin.
 *
 * Fields are split at in_format->fs. If in_format->quote is non-negative,
 * a quoted section may contain separators and a doubled quote stands for
 * a single quote character. Carriage returns are dropped and a newline
 * always ends the line, even inside quotes.
 *
 * Returns 1 if a line was read (possibly with zero fields), 0 at end of
 * input with nothing read.
 */
static int csv_read(void)
{
	int quoted = 0;
	// FIXME: Complain if closing quote is missing?
	for (;;) {
		int c = getchar();
restart:
		if (c < 0)
			return !!fields_count(&in_fields);
		if (c == '\r')
			continue;
		if (c == '\n')
			return 1;
		if (quoted) {
			if (c == in_format->quote) {
				c = getchar();
				if (c != in_format->quote) {
					// Closing quote: re-examine the following character
					quoted = 0;
					goto restart;
				}
				// Two quotes assimilate to one
			}
			// Fall through to pushing the character
		} else if (c == in_format->quote) {
			// An opening quote starts the field even if it turns out
			// to be empty, so that a lone "" is not lost.
			ensure_field();
			quoted = 1;
			continue;
		} else if (c == in_format->fs) {
			ensure_field();
			new_field();
			continue;
		}
		ensure_field();
		*line_push(&in_line) = c;
		in_field->len++;
	}
}

/*
 * Write out_fields to stdout as one line, separated by out_format->fs.
 *
 * If the output format has a quote character, a field containing the
 * separator or the quote character is wrapped in quotes and every quote
 * inside it is doubled.
 */
static void csv_write(void)
{
	/* Read the start pointer directly: the buffer may have no storage yet. */
	unsigned char *line = in_line.start;
	int n = fields_count(&out_fields);
	for (int i=0; i<n; i++) {
		struct field *f = fields_nth(&out_fields, i);
		int need_quotes = 0;
		if (out_format->quote >= 0) {
			for (int j=0; j < f->len; j++) {
				int c = line[f->start_pos + j];
				if (c == out_format->fs || c == out_format->quote) {
					need_quotes = 1;
					break;
				}
			}
		}
		if (i)
			putchar(out_format->fs);
		if (need_quotes)
			putchar(out_format->quote);
		for (int j=0; j < f->len; j++) {
			int c = line[f->start_pos + j];
			if (c == out_format->quote)
				putchar(c);
			putchar(c);
		}
		if (need_quotes)
			putchar(out_format->quote);
	}
	putchar('\n');
}

/*** Field selection ***/

/*
 * A range of 1-based field numbers. A bound of 0 means "unspecified":
 * the first field for first_field, the last field of the line for last_field.
 */
struct selector {
	int first_field, last_field;
};

DECLARE_BUF(selectors, struct selector);
static selectors_t selectors;

/*
 * Parse an optional run of decimal digits at *pp into *num (0 if there are
 * no digits) and advance *pp past them. Returns 0 if the number does not
 * fit in an int, 1 otherwise.
 */
static int parse_field_number(const char **pp, int *num)
{
	const char *p = *pp;
	int n = 0;
	while (*p >= '0' && *p <= '9') {
		int digit = *p++ - '0';
		if (n > (INT_MAX - digit) / 10)
			return 0;
		n = 10*n + digit;
	}
	*pp = p;
	*num = n;
	return 1;
}

/*
 * Parse a selector of the form "N", "N-M", "N-", "-M" or "-" and append it
 * to the list of selectors. Returns NULL on success or an error message.
 */
static const char *parse_selector(const char *str)
{
	const char *p = str;
	int first = 0, last = 0;

	if (!parse_field_number(&p, &first))
		return "Field number out of range.";
	if (*p == '-') {
		p++;
		if (!parse_field_number(&p, &last))
			return "Field number out of range.";
	} else
		last = first;
	if (*p)
		return "Invalid field selector.";

	struct selector *s = selectors_push(&selectors);
	s->first_field = first;
	s->last_field = last;
	return NULL;
}

/* If no selectors were given, select all fields. */
static void finish_parse_selectors(void)
{
	if (!selectors_count(&selectors))
		parse_selector("-");
}

/*
 * Fill out_fields according to the selectors. Selected field numbers
 * beyond the end of the input line produce empty fields.
 */
static void select_fields(void)
{
	for (int i = 0; i < selectors_count(&selectors); i++) {
		struct selector *s = selectors_nth(&selectors, i);
		int first = s->first_field;
		if (first <= 0)
			first = 1;
		int last = s->last_field;
		if (last <= 0)
			last = fields_count(&in_fields);
		for (int j = first; j <= last; j++) {
			struct field *f = fields_push(&out_fields);
			if (j >= 1 && j <= fields_count(&in_fields))
				*f = *fields_nth(&in_fields, j-1);
			else
				f->start_pos = f->len = 0;
		}
	}
}

/*** Parsing of arguments ***/

/*
 * Print the help text and exit successfully.
 * NOTE(review): the <...> placeholders were missing from the extracted
 * source and have been reconstructed; confirm the wording against upstream.
 */
static void usage(void)
{
	fputs(
		"Usage: xsv [<options>] [<field-selectors>]\n"
		"\n"
		"Formats:\n"
		"-t, --tsv\t\tTAB-separated values (default)\n"
		"-c, --csv\t\tComma-separated values\n"
		"-w, --ws\t\tValues separated by arbitrary whitespace\n"
		"\n"
		"Format parameters:\n"
		"-d, --fs=<char>\t\tDelimiter of fields\n"
		"\n"
		"Other options:\n"
		"(so far none)\n",
		stdout);
	exit(0);
}

/* Report a command-line error (msg may be NULL) and exit(1). */
static void bad_args(const char *msg)
{
	if (msg)
		fprintf(stderr, "xsv: %s\n", msg);
	fprintf(stderr, "Try `xsv --help' for more information.\n");
	exit(1);
}

static const char short_options[] = "cd:tw";

enum long_options {
	OPT_HELP = 256,
};

static const struct option long_options[] = {
	{ "csv",		0,	NULL,	'c' },
	{ "fs",			1,	NULL,	'd' },
	{ "tsv",		0,	NULL,	't' },
	{ "ws",			0,	NULL,	'w' },
	{ "help",		0,	NULL,	OPT_HELP },
	{ NULL,			0,	NULL,	0 },
};

/*
 * Create a format with default parameters for the given format_id.
 * The first format given becomes the input format, the second one
 * the output format; a third one is an error.
 */
static void set_format(int format_id)
{
	struct format *f = xmalloc(sizeof(*f));
	memset(f, 0, sizeof(*f));
	f->id = format_id;

	switch (format_id) {
		case FORM_TSV:
			f->fs = '\t';
			f->quote = -1;
			f->read_line = csv_read;
			f->write_line = csv_write;
			break;
		case FORM_CSV:
			f->fs = ',';
			f->quote = '"';
			f->read_line = csv_read;
			f->write_line = csv_write;
			break;
		case FORM_WS:
		default:
			// No reader/writer yet; main() refuses to run with such a format
			break;
	}

	if (!in_format)
		in_format = f;
	else if (!out_format)
		out_format = f;
	else
		bad_args("At most two formats may be given.");
}

/*
 * Return the format which subsequent format parameters apply to:
 * the most recently given one, or a default TSV input format if
 * none has been given yet.
 */
static struct format *current_format(void)
{
	if (out_format)
		return out_format;
	if (in_format)
		return in_format;
	set_format(FORM_TSV);
	return in_format;
}

int main(int argc, char **argv)
{
	int opt;

	while ((opt = getopt_long(argc, argv, short_options, long_options, NULL)) >= 0)
		switch (opt) {
			case 'c':
				set_format(FORM_CSV);
				break;
			case 'd':
				if (optarg[0])
					current_format()->fs = optarg[0];
				else
					bad_args("No field delimiter given.");
				break;
			case 't':
				set_format(FORM_TSV);
				break;
			case 'w':
				set_format(FORM_WS);
				break;
			case OPT_HELP:
				usage();
				break;
			default:
				bad_args(NULL);
		}

	// Make sure an input format exists; the output format defaults to it
	current_format();
	if (!out_format)
		out_format = in_format;

	// Formats without a reader/writer would otherwise be called through NULL
	if (!in_format->read_line || !out_format->write_line)
		bad_args("The requested format is not implemented yet.");

	for (int i = optind; i < argc; i++) {
		const char *err = parse_selector(argv[i]);
		if (err)
			bad_args(err);
	}
	finish_parse_selectors();

	fields_init(&in_fields);
	fields_init(&out_fields);
	line_init(&in_line);

	for (;;) {
		fields_reset(&in_fields);
		line_reset(&in_line);
		in_field = NULL;
		if (!in_format->read_line())
			break;

		fields_reset(&out_fields);
		select_fields();

		out_format->write_line();
	}

	if (fflush(stdout) || ferror(stdout)) {
		fprintf(stderr, "xsv: Error writing output\n");
		return 1;
	}

	return 0;
}