]> mj.ucw.cz Git - xsv.git/commitdiff
First version: --csv and --tsv works
author Martin Mares <mj@ucw.cz>
Mon, 23 Jul 2012 18:14:22 +0000 (20:14 +0200)
committer Martin Mares <mj@ucw.cz>
Mon, 23 Jul 2012 18:14:22 +0000 (20:14 +0200)
xsv.c [new file with mode: 0644]

diff --git a/xsv.c b/xsv.c
new file mode 100644 (file)
index 0000000..fa7964d
--- /dev/null
+++ b/xsv.c
@@ -0,0 +1,356 @@
+/*
+ *     A Swiss-Army Knife for CSV-like Files
+ *
+ *     (c) 2012 Martin Mares <mj@ucw.cz>
+ */
+
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <getopt.h>
+
+/*** Memory allocation ***/
+
+/*
+ * Allocate `bytes' bytes using malloc(). If malloc() returns NULL, an
+ * out-of-memory message is printed to stderr and the program exits with
+ * status 1, so callers never see a NULL result.
+ */
+static void *xmalloc(size_t bytes)
+{
+       void *mem = malloc(bytes);
+
+       if (mem)
+               return mem;
+
+       fprintf(stderr, "xsv: Out of memory (cannot allocate %zu bytes)\n", bytes);
+       exit(1);
+}
+
+/*
+ * Resize the block `old' to `bytes' bytes using realloc(). If realloc()
+ * returns NULL, an out-of-memory message is printed to stderr and the
+ * program exits with status 1, so callers never see a NULL result.
+ */
+static void *xrealloc(void *old, size_t bytes)
+{
+       void *mem = realloc(old, bytes);
+
+       if (mem)
+               return mem;
+
+       fprintf(stderr, "xsv: Out of memory (cannot allocate %zu bytes)\n", bytes);
+       exit(1);
+}
+
+/*
+ * DECLARE_BUF(name, type) defines a growable array of `type':
+ *
+ *   <name>_t            the buffer itself (start = storage, count = number
+ *                       of used elements, max = number of allocated ones)
+ *   <name>_init(b)      make the buffer empty with no storage
+ *   <name>_reset(b)     forget all elements, but keep the storage
+ *   <name>_count(b)     number of elements currently stored
+ *   <name>_extend(b)    grow the storage: 16 elements at first, then
+ *                       twice the previous size
+ *   <name>_push(b)      append one element (extending the storage if it
+ *                       is full) and return a pointer to it; the element
+ *                       is not initialized
+ *   <name>_nth(b, n)    pointer to element number n (counted from 0),
+ *                       without any range check
+ *
+ * The storage is resized by xrealloc(), so pointers obtained from
+ * push/nth may become invalid after a later push.
+ */
+#define DECLARE_BUF(name, type) \
+       typedef struct { type *start; int count; int max; } name##_t;                           \
+       static inline void name##_init(name##_t *b) { b->start = NULL; b->count = b->max = 0; } \
+       static inline void name##_reset(name##_t *b) { b->count = 0; }                          \
+       static inline int name##_count(name##_t *b) { return b->count; }                        \
+       static void name##_extend(name##_t *b) {                                                \
+               b->max = b->max ? 2*b->max : 16;                                                \
+               b->start = xrealloc(b->start, b->max * sizeof(type));                           \
+       }                                                                                       \
+       static inline type *name##_push(name##_t *b) {                                          \
+               if (b->count >= b->max) name##_extend(b);                                       \
+               return &b->start[b->count++];                                                   \
+       }                                                                                       \
+       static inline type *name##_nth(name##_t *b, int n) { return &b->start[n]; }             \
+       // end
+
+/*** Formats and their parameters ***/
+
+// Identifiers of the known file formats
+enum format_id {
+       FORM_UNSPEC,            // Not referenced elsewhere in this file; presumably "no format chosen"
+       FORM_TSV,               // TAB-separated values
+       FORM_CSV,               // Comma-separated values
+       FORM_WS,                // Values separated by whitespace; no reader/writer exists for it yet
+};
+
+// Description of one input or output format
+struct format {
+       enum format_id id;              // Which format this is
+       int fs;                         // Field separator character
+       int quote;                      // Quote character, or -1 if the format has no quoting
+       int (*read_line)(void);         // Reads one record; returns 0 when there is nothing more to read
+       void (*write_line)(void);       // Writes the selected fields as one record
+};
+
+// Formats used for reading and writing; filled in by set_format() in the order the formats are given
+static struct format *in_format, *out_format;
+
+// One field of a record, described as a slice of the line buffer
+struct field {
+       int start_pos;          // Index of the first byte of the field in the line buffer
+       int len;                // Number of bytes in the field
+};
+
+// Growable arrays of field descriptors and of raw line bytes
+DECLARE_BUF(fields, struct field);
+DECLARE_BUF(line, unsigned char);
+
+// Fields of the record just read, and fields selected for output
+static fields_t in_fields, out_fields;
+// Field currently being filled by the reader; NULL if the record has no field yet
+static struct field *in_field;
+// Bytes of all fields of the current record, stored one after another
+static line_t in_line;
+
+/*
+ * Start a new input field: append a descriptor to in_fields, make it the
+ * current field and mark it as empty, beginning at the current end of
+ * in_line.
+ */
+static void new_field(void)
+{
+       struct field *f = fields_push(&in_fields);
+
+       f->len = 0;
+       f->start_pos = line_count(&in_line);
+       in_field = f;
+}
+
+/* Make sure that a current input field exists, starting one if needed. */
+static void ensure_field(void)
+{
+       if (in_field)
+               return;
+       new_field();
+}
+
+/*
+ * Read one record from standard input, using the field separator and
+ * the quote character of in_format. Field contents are appended to
+ * in_line and described by entries of in_fields.
+ *
+ * Carriage returns are dropped everywhere and a newline always ends the
+ * record, even inside quotes. Within a quoted section, the separator is
+ * taken literally and a doubled quote stands for a single quote character.
+ *
+ * Returns 1 if a record has been read (an empty line is a record with no
+ * fields), 0 if the end of input was reached before any field was seen.
+ */
+static int csv_read(void)
+{
+       int quoted = 0;
+       // FIXME: Complain if closing quote is missing?
+       for (;;) {
+               int c = getchar();
+restart:
+               if (c < 0)
+                       return !!fields_count(&in_fields);
+               if (c == '\r')
+                       continue;
+               if (c == '\n')
+                       return 1;
+               if (quoted) {
+                       if (c == in_format->quote) {
+                               // Look ahead to tell a closing quote from a doubled one
+                               c = getchar();
+                               if (c != in_format->quote) {
+                                       quoted = 0;
+                                       // The look-ahead character still has to be processed
+                                       goto restart;
+                               }
+                               // Two quotes assimilate to one
+                       }
+                       // Fall through to pushing the character
+               } else if (c == in_format->quote) {
+                       // An opening quote always denotes a field, even if nothing
+                       // follows inside it; otherwise a lone "" would leave no trace.
+                       ensure_field();
+                       quoted = 1;
+                       continue;
+               } else if (c == in_format->fs) {
+                       // The separator closes the current field (possibly an empty
+                       // one which has not been started yet) and opens the next one.
+                       ensure_field();
+                       new_field();
+                       continue;
+               }
+               ensure_field();
+               *line_push(&in_line) = c;
+               in_field->len++;
+       }
+}
+
+/*
+ * Write the fields listed in out_fields to standard output as a single
+ * record in out_format, terminated by a newline.
+ *
+ * If the format has a quote character, a field containing the separator
+ * or the quote character is enclosed in quotes. Quote characters inside
+ * the data are written twice.
+ */
+static void csv_write(void)
+{
+       const unsigned char *base = line_nth(&in_line, 0);
+       const int sep = out_format->fs;
+       const int q = out_format->quote;
+       const int total = fields_count(&out_fields);
+
+       for (int idx = 0; idx < total; idx++) {
+               const struct field *fld = fields_nth(&out_fields, idx);
+               int wrap = 0;
+
+               // Without a quote character (q < 0), nothing is ever quoted
+               for (int k = 0; q >= 0 && !wrap && k < fld->len; k++) {
+                       int ch = base[fld->start_pos + k];
+                       wrap = (ch == sep || ch == q);
+               }
+
+               if (idx > 0)
+                       putchar(sep);
+               if (wrap)
+                       putchar(q);
+               for (int k = 0; k < fld->len; k++) {
+                       int ch = base[fld->start_pos + k];
+                       if (ch == q)
+                               putchar(ch);
+                       putchar(ch);
+               }
+               if (wrap)
+                       putchar(q);
+       }
+       putchar('\n');
+}
+
+/*** Field selection ***/
+
+// A range of field numbers (counted from 1) requested on the command line.
+// A bound which is zero or negative is replaced by select_fields():
+// first_field by 1, last_field by the number of input fields.
+struct selector {
+       int first_field, last_field;
+};
+
+// List of all selectors, in the order in which they were parsed
+DECLARE_BUF(selectors, struct selector);
+static selectors_t selectors;
+
+/*
+ * Parse a non-negative decimal number from the first `len' characters
+ * of `s'. An empty string yields 0, which means "not specified".
+ * Returns 1 on success, 0 if a character other than a digit is found
+ * or if the number does not fit in an int.
+ */
+static int parse_field_number(const char *s, size_t len, int *result)
+{
+       int val = 0;
+       for (size_t i = 0; i < len; i++) {
+               if (s[i] < '0' || s[i] > '9')
+                       return 0;
+               int digit = s[i] - '0';
+               // Refuse the digit if 10*val + digit would overflow
+               if (val > (INT_MAX - digit) / 10)
+                       return 0;
+               val = 10*val + digit;
+       }
+       *result = val;
+       return 1;
+}
+
+/*
+ * Parse one field selector and append it to the list of selectors.
+ *
+ * A selector is either a single field number "N", or a range "N-M".
+ * Either bound of a range may be left out ("N-", "-M", "-"); a missing
+ * bound is stored as 0. The string is not modified.
+ *
+ * Returns NULL on success. If the selector is malformed, nothing is
+ * appended and an error message is returned.
+ */
+static char *parse_selector(char *str)
+{
+       int first, last;
+       const char *sep = strchr(str, '-');
+
+       if (sep) {
+               if (!parse_field_number(str, (size_t)(sep - str), &first) ||
+                   !parse_field_number(sep + 1, strlen(sep + 1), &last))
+                       return "Invalid field selector.";
+       } else {
+               if (!parse_field_number(str, strlen(str), &first))
+                       return "Invalid field selector.";
+               last = first;
+       }
+
+       // Push only after validation, so that a bad selector leaves no entry behind
+       struct selector *s = selectors_push(&selectors);
+       s->first_field = first;
+       s->last_field = last;
+       return NULL;
+}
+
+/* If no selector has been given, fall back to "-", i.e., to all fields. */
+static void finish_parse_selectors(void)
+{
+       if (selectors_count(&selectors) != 0)
+               return;
+       parse_selector("-");
+}
+
+/*
+ * Fill out_fields according to the selectors, in selector order.
+ * A lower bound which is not positive means the first field, an upper
+ * bound which is not positive means the last field of the input record.
+ * Requested fields which do not exist in the record come out empty.
+ */
+static void select_fields(void)
+{
+       const int num_selectors = selectors_count(&selectors);
+       const int avail = fields_count(&in_fields);
+
+       for (int si = 0; si < num_selectors; si++) {
+               const struct selector *sel = selectors_nth(&selectors, si);
+               const int from = (sel->first_field > 0) ? sel->first_field : 1;
+               const int to = (sel->last_field > 0) ? sel->last_field : avail;
+
+               for (int pos = from; pos <= to; pos++) {
+                       struct field *dst = fields_push(&out_fields);
+                       if (pos > avail) {
+                               // Beyond the end of the record: produce an empty field
+                               dst->len = 0;
+                               dst->start_pos = 0;
+                       } else
+                               *dst = *fields_nth(&in_fields, pos-1);
+               }
+       }
+}
+
+/*** Parsing of arguments ***/
+
+/* Print the help text to standard output and exit with status 0. */
+static void usage(void)
+{
+       static const char help_text[] =
+               "Usage: xsv <in-format> [<out-format>] <options> [<fields>]\n"
+               "\n"
+               "Formats:\n"
+               "-t, --tsv              TAB-separated values (default)\n"
+               "-c, --csv              Comma-separated values\n"
+               "-w, --ws               Values separated by arbitrary whitespace\n"
+               "\n"
+               "Format parameters:\n"
+               "-d, --fs=<char>        Delimiter of fields\n"
+               "\n"
+               "Other options:\n"
+               "(so far none)\n";
+
+       fputs(help_text, stdout);
+       exit(0);
+}
+
+/*
+ * Report a problem with the command line and exit with status 1.
+ * The message `msg' is printed to stderr unless it is NULL; a hint
+ * pointing to --help is printed in either case.
+ */
+static void bad_args(char *msg)
+{
+       if (msg != NULL)
+               fprintf(stderr, "xsv: %s\n", msg);
+       fputs("Try `xsv --help' for more information.\n", stderr);
+       exit(1);
+}
+
+// Short options for getopt_long(): -c, -t and -w are plain flags, -d takes an argument
+static const char short_options[] = "cd:tw";
+
+// Codes of options which have only a long form; they start at 256,
+// which no single-character option code can reach
+enum long_options {
+       OPT_HELP = 256,
+};
+
+// Long options for getopt_long(). The columns are: name, whether an
+// argument is taken (0 = no, 1 = yes), flag pointer (always NULL here)
+// and the code reported to the option loop in main().
+static const struct option long_options[] = {
+       { "csv",                0,      NULL,   'c' },
+       { "fs",                 1,      NULL,   'd' },
+       { "tsv",                0,      NULL,   't' },
+       { "ws",                 0,      NULL,   'w' },
+       { "help",               0,      NULL,   OPT_HELP },
+       { NULL,                 0,      NULL,   0 },    // Terminator
+};
+
+/*
+ * Create a format of the given kind (one of the FORM_xxx constants) with
+ * its default parameters and install it as the input format or, if the
+ * input format is already set, as the output format.
+ *
+ * Exits via bad_args() if the format has no implementation, or if both
+ * the input and the output format have been set already.
+ */
+static void set_format(int format_id)
+{
+       struct format *f = xmalloc(sizeof(*f));
+       memset(f, 0, sizeof(*f));
+       f->id = format_id;
+
+       switch (format_id) {
+               case FORM_TSV:
+                       f->fs = '\t';
+                       f->quote = -1;                  // TSV has no quoting
+                       f->read_line = csv_read;
+                       f->write_line = csv_write;
+                       break;
+               case FORM_CSV:
+                       f->fs = ',';
+                       f->quote = '"';
+                       f->read_line = csv_read;
+                       f->write_line = csv_write;
+                       break;
+               case FORM_WS:
+               default:
+                       // There is no reader or writer for this format. Refuse it
+                       // here, otherwise main() would call a NULL function pointer.
+                       bad_args("Requested format is not implemented yet.");
+                       break;
+       }
+
+       if (!in_format)
+               in_format = f;
+       else if (!out_format)
+               out_format = f;
+       else
+               bad_args("At most two format may be given.");
+}
+
+/*
+ * Return the most recently given format: the output format if there is
+ * one, otherwise the input format. If no format has been given at all,
+ * TSV is set up as the input format first.
+ */
+static struct format *current_format(void)
+{
+       if (!out_format && !in_format)
+               set_format(FORM_TSV);
+       return out_format ? out_format : in_format;
+}
+
+/*
+ * Entry point: parse options and field selectors, then copy records
+ * from standard input to standard output, keeping only the selected
+ * fields and converting from the input format to the output format.
+ *
+ * Returns 0 on success, 1 if reading the input or writing the output
+ * has failed. Problems with arguments exit with status 1 via bad_args().
+ */
+int main(int argc, char **argv)
+{
+       int opt;
+
+       while ((opt = getopt_long(argc, argv, short_options, long_options, NULL)) >= 0)
+               switch (opt) {
+                       case 'c':
+                               set_format(FORM_CSV);
+                               break;
+                       case 'd':
+                               // The delimiter applies to the most recently given format
+                               if (optarg[0])
+                                       current_format()->fs = optarg[0];
+                               else
+                                       bad_args("No field delimiter given.");
+                               break;
+                       case 't':
+                               set_format(FORM_TSV);
+                               break;
+                       case 'w':
+                               set_format(FORM_WS);
+                               break;
+                       case OPT_HELP:
+                               usage();
+                               break;          // Not reached, usage() exits
+                       default:
+                               bad_args(NULL);
+               }
+
+       // Make sure an input format exists; the output format defaults to it
+       current_format();
+       if (!out_format)
+               out_format = in_format;
+
+       for (int i = optind; i < argc; i++) {
+               char *err = parse_selector(argv[i]);
+               if (err)
+                       bad_args(err);
+       }
+       finish_parse_selectors();
+
+       fields_init(&in_fields);
+       fields_init(&out_fields);
+       line_init(&in_line);
+
+       for (;;) {
+               fields_reset(&in_fields);
+               line_reset(&in_line);
+               in_field = NULL;
+               if (!in_format->read_line())
+                       break;
+
+               fields_reset(&out_fields);
+               select_fields();
+
+               out_format->write_line();
+       }
+
+       // Do not report success if the input or the output has been cut short
+       if (ferror(stdin)) {
+               fprintf(stderr, "xsv: Error reading input\n");
+               return 1;
+       }
+       if (fflush(stdout) || ferror(stdout)) {
+               fprintf(stderr, "xsv: Error writing output\n");
+               return 1;
+       }
+
+       return 0;
+}