mj.ucw.cz Git - xsv.git/blobdiff - xsv.c
Simplified CSV and WS parsers
[xsv.git] / xsv.c
diff --git a/xsv.c b/xsv.c
index fa7964da7727328dfc087dd1c080bdcb4944d4b3..4ad9e477714db146b918eb600e8f5ba4d9f10301 100644 (file)
--- a/xsv.c
+++ b/xsv.c
@@ -7,8 +7,11 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <stdarg.h>
 #include <getopt.h>
 
+#include <pcre.h>
+
 /*** Memory allocation ***/
 
 static void *xmalloc(size_t bytes)
@@ -44,6 +47,7 @@ static void *xrealloc(void *old, size_t bytes)
                if (b->count >= b->max) name##_extend(b);                                       \
                return &b->start[b->count++];                                                   \
        }                                                                                       \
+       static inline type *name##_first(name##_t *b) { return b->start; }                      \
        static inline type *name##_nth(name##_t *b, int n) { return &b->start[n]; }             \
        // end
 
@@ -54,14 +58,21 @@ enum format_id {
        FORM_TSV,
        FORM_CSV,
        FORM_WS,
+       FORM_REGEX,
 };
 
 struct format {
        enum format_id id;
        int fs;
        int quote;
+       int quiet;  // when non-zero, warn() prints nothing for this format
        int (*read_line)(void);
        void (*write_line)(void);
+       // CSV backend:
+       int always_quote;  // csv_write() quotes every field when this is set and quote >= 0
+       // regex backend:
+       pcre *pcre;  // compiled separator pattern, filled in by regex_set()
+       pcre_extra *pcre_extra;  // result of pcre_study() on the pattern, filled in by regex_set()
 };
 
 static struct format *in_format, *out_format;
@@ -77,33 +88,64 @@ DECLARE_BUF(line, unsigned char);
 static fields_t in_fields, out_fields;
 static struct field *in_field;
 static line_t in_line;
+static int line_number;
 
-static void new_field(void)
+static void new_field(int pos)
 {
        in_field = fields_push(&in_fields);
-       in_field->start_pos = line_count(&in_line);
+       in_field->start_pos = pos;
        in_field->len = 0;
 }
 
-static void ensure_field(void)
+static void ensure_field(int pos)
 {
        if (!in_field)
-               new_field();
+               new_field(pos);
+}
+
+static void warn(struct format *fmt, char *msg, ...)  // printf-style warning on stderr, prefixed with the current line_number
+{
+       if (!fmt->quiet) {  // a format marked quiet produces no warnings at all
+               fprintf(stderr, "Warning at line %d: ", line_number);
+               va_list args;
+               va_start(args, msg);
+               vfprintf(stderr, msg, args);  // format string comes first, the va_list last
+               va_end(args);
+               fputc('\n', stderr);
+       }
+}
+
+static int next_line(void)  // append one input line from stdin to in_line; returns 1 if a line was read, 0 otherwise
+{
+       for (;;) {
+               int c = getchar();
+               if (c == '\r')  // carriage returns are dropped wherever they appear
+                       continue;
+               if (c < 0)  // end of input: still a line if anything was collected before it
+                       return !!line_count(&in_line);
+               if (c == '\n')  // the newline itself is not stored
+                       return 1;
+               *line_push(&in_line) = c;
+       }
 }
 
 static int csv_read(void)
 {
        int quoted = 0;
-       // FIXME: Complain if closing quote is missing?
        for (;;) {
                int c = getchar();
+               int i = line_count(&in_line);
 restart:
-               if (c < 0)
-                       return !!fields_count(&in_fields);
                if (c == '\r')
                        continue;
-               if (c == '\n')
-                       return 1;
+               if (c < 0 || c == '\n') {
+                       if (quoted)
+                               warn(in_format, "Missing closing quote.");
+                       if (c < 0)
+                               return !!fields_count(&in_fields);
+                       else
+                               return 1;
+               }
                if (quoted) {
                        if (c == in_format->quote) {
                                c = getchar();
@@ -118,30 +160,34 @@ restart:
                        quoted = 1;
                        continue;
                } else if (c == in_format->fs && !quoted) {
-                       ensure_field();
-                       new_field();
+                       ensure_field(i);
+                       new_field(i);
                        continue;
                }
-               ensure_field();
+               ensure_field(i);
                *line_push(&in_line) = c;
                in_field->len++;
        }
 }
 
+static int is_ws(int c)  // non-zero when c is a space, a tab or a form feed
+{
+       return (c == ' ' || c == '\t' || c == '\f');
+}
+
 static void csv_write(void)
 {
-       unsigned char *line = line_nth(&in_line, 0);
+       unsigned char *line = line_first(&in_line);
        int n = fields_count(&out_fields);
        for (int i=0; i<n; i++) {
                struct field *f = fields_nth(&out_fields, i);
                int need_quotes = 0;
                if (out_format->quote >= 0) {
-                       for (int j=0; j < f->len; j++) {
+                       need_quotes = out_format->always_quote;
+                       for (int j=0; !need_quotes && j < f->len; j++) {
                                int c = line[f->start_pos + j];
-                               if (c == out_format->fs || c == out_format->quote) {
+                               if (c == out_format->fs || c == out_format->quote)
                                        need_quotes = 1;
-                                       break;
-                               }
                        }
                }
                if (i)
@@ -150,6 +196,8 @@ static void csv_write(void)
                        putchar(out_format->quote);
                for (int j=0; j < f->len; j++) {
                        int c = line[f->start_pos + j];
+                       if (c == out_format->fs && !need_quotes)
+                               warn(out_format, "Field separator found inside field and quoting is turned off.");
                        if (c == out_format->quote)
                                putchar(c);
                        putchar(c);
@@ -160,6 +208,93 @@ static void csv_write(void)
        putchar('\n');
 }
 
+static int ws_read(void)  // reader for whitespace-separated input; returns 0 when next_line() has nothing more, 1 otherwise
+{
+       if (!next_line())
+               return 0;
+
+       unsigned char *line = line_first(&in_line);
+       int n = line_count(&in_line);
+       if (!n)  // empty line: reported as a line with no fields
+               return 1;
+
+       int ws = 0;  // whitespace characters seen since the last field character
+       new_field(0);  // the first field always starts at offset 0, so leading whitespace leaves it empty
+       for (int i=0; i<n; i++) {
+               int c = line[i];
+               if (is_ws(c)) {
+                       ws++;
+               } else {
+                       if (ws) {  // a whitespace run just ended; the whole run counts as one separator
+                               new_field(i);
+                               ws = 0;
+                       }
+                       in_field->len++;
+               }
+       }
+
+       if (ws)  // line ends in whitespace: an extra empty field is started at offset n
+               new_field(n);
+       return 1;
+}
+
+static const char *regex_set(struct format *f, char *rx)  // compile rx as the separator pattern of f; returns NULL or the error string reported by PCRE
+{
+       const char *err;
+       int errpos;
+       f->pcre = pcre_compile(rx, PCRE_DOLLAR_ENDONLY, &err, &errpos, NULL);
+       if (!f->pcre)
+               return err;
+
+       f->pcre_extra = pcre_study(f->pcre, 0, &err);
+       if (!f->pcre_extra)  // NOTE(review): PCRE documents that pcre_study() may return NULL with err == NULL when it has nothing to add, so this path can also mean success -- confirm
+               return err;
+
+       return NULL;
+}
+
+static int regex_read(void)  // reader for regex-separated input; returns 0 when next_line() has nothing more, 1 otherwise
+{
+       if (!next_line())
+               return 0;
+
+       unsigned char *c = line_first(&in_line);
+       int n = line_count(&in_line);
+       if (!n)  // empty line: reported as a line with no fields
+               return 1;
+
+       int i = 0;  // offset in the line where the next field starts
+       for (;;) {
+               int ovec[3];  // ovec[0], ovec[1]: start and end offset of the separator match
+               int sep = pcre_exec(in_format->pcre, in_format->pcre_extra, (char *) c, n, i, PCRE_NOTEMPTY, ovec, 3);  // PCRE_NOTEMPTY: an empty separator match would leave i unchanged and loop forever
+               if (sep < 0) {
+                       if (sep != PCRE_ERROR_NOMATCH)
+                               warn(in_format, "PCRE matching error %d", sep);
+                       // No further occurrence of the separator: the rest is a single field
+                       new_field(i);
+                       in_field->len = n - i;
+                       return 1;
+               }
+               new_field(i);
+               in_field->len = ovec[0] - i;  // the field ends where the separator begins
+               i = ovec[1];  // continue right after the separator
+       }
+}
+
+/*** Transforms ***/
+
+static void trim_fields(void)  // strip is_ws() characters from both ends of every field in in_fields
+{
+       unsigned char *line = line_first(&in_line);
+       for (int i = 0; i < fields_count(&in_fields); i++) {
+               struct field *f = fields_nth(&in_fields, i);
+               while (f->len && is_ws(line[f->start_pos]))  // leading whitespace: move the start forward
+                       f->start_pos++, f->len--;
+               while (f->len && is_ws(line[f->start_pos + f->len - 1]))  // trailing whitespace: shorten the field
+                       f->len--;  // only start_pos/len change; the line bytes themselves are left alone
+       }
+}
+
 /*** Field selection ***/
 
 struct selector {
@@ -223,33 +358,48 @@ Formats:\n\
 -t, --tsv              TAB-separated values (default)\n\
 -c, --csv              Comma-separated values\n\
 -w, --ws               Values separated by arbitrary whitespace\n\
+-r, --regex=<rx>       Separator given by Perl regular expression (input only)\n\
 \n\
 Format parameters:\n\
--d, --fs=<char>        Delimiter of fields\n\
+-d, --fs=<char>                Delimiter of fields\n\
+-q, --quiet            Do not show warnings\n\
+    --always-quote     Put quotes around all fields (CSV output only)\n\
 \n\
 Other options:\n\
-(so far none)\n\
+    --trim             Trim leading and trailing whitespaces in fields\n\
 ");
        exit(0);
 }
 
-static void bad_args(char *msg)
+static void bad_args(const char *msg, ...)
 {
-       if (msg)
-               fprintf(stderr, "xsv: %s\n", msg);
+       if (msg) {
+               va_list args;
+               va_start(args, msg);
+               fprintf(stderr, "xsv: ");
+               vfprintf(stderr, msg, args);
+               fputc('\n', stderr);
+               va_end(args);
+       }
        fprintf(stderr, "Try `xsv --help' for more information.\n");
        exit(1);
 }
 
-static const char short_options[] = "cd:tw";
+static const char short_options[] = "cd:qr:tw";
 
 enum long_options {
        OPT_HELP = 256,
+       OPT_TRIM,
+       OPT_ALWAYS_QUOTE,
 };
 
 static const struct option long_options[] = {
+       { "always-quote",       0,      NULL,   OPT_ALWAYS_QUOTE },
        { "csv",                0,      NULL,   'c' },
        { "fs",                 1,      NULL,   'd' },
+       { "quiet",              0,      NULL,   'q' },
+       { "regex",              1,      NULL,   'r' },
+       { "trim",               0,      NULL,   OPT_TRIM },
        { "tsv",                0,      NULL,   't' },
        { "ws",                 0,      NULL,   'w' },
        { "help",               0,      NULL,   OPT_HELP },
@@ -276,6 +426,13 @@ static void set_format(int format_id)
                        f->write_line = csv_write;
                        break;
                case FORM_WS:
+                       f->fs = ' ';
+                       f->quote = -1;
+                       f->read_line = ws_read;
+                       f->write_line = csv_write;
+                       break;
+               case FORM_REGEX:
+                       f->read_line = regex_read;
                        break;
        }
 
@@ -300,6 +457,8 @@ static struct format *current_format(void)
 int main(int argc, char **argv)
 {
        int opt;
+       int want_trim = 0;
+       const char *err;
 
        while ((opt = getopt_long(argc, argv, short_options, long_options, NULL)) >= 0)
                switch (opt) {
@@ -312,14 +471,31 @@ int main(int argc, char **argv)
                                else
                                        bad_args("No field delimiter given.");
                                break;
+                       case 'q':
+                               current_format()->quiet = 1;
+                               break;
+                       case 'r':
+                               set_format(FORM_REGEX);
+                               err = regex_set(current_format(), optarg);
+                               if (err)
+                                       bad_args("Error compiling regex: %s", err);
+                               break;
                        case 't':
                                set_format(FORM_TSV);
                                break;
                        case 'w':
                                set_format(FORM_WS);
                                break;
+                       case OPT_ALWAYS_QUOTE:
+                               if (current_format()->id != FORM_CSV)
+                                       bad_args("--always-quote makes sense only for CSV.");
+                               current_format()->always_quote = 1;
+                               break;
                        case OPT_HELP:
                                usage();
+                       case OPT_TRIM:
+                               want_trim = 1;
+                               break;
                        default:
                                bad_args(NULL);
                }
@@ -327,9 +503,13 @@ int main(int argc, char **argv)
        current_format();
        if (!out_format)
                out_format = in_format;
+       if (!in_format->read_line)
+               bad_args("Write-only format selected for input.");
+       if (!out_format->write_line)
+               bad_args("Read-only format selected for output.");
 
        for (int i = optind; i < argc; i++) {
-               char *err = parse_selector(argv[i]);
+               err = parse_selector(argv[i]);
                if (err)
                        bad_args(err);
        }
@@ -340,12 +520,16 @@ int main(int argc, char **argv)
        line_init(&in_line);
 
        for (;;) {
+               line_number++;
                fields_reset(&in_fields);
                line_reset(&in_line);
                in_field = NULL;
                if (!in_format->read_line())
                        break;
 
+               if (want_trim)
+                       trim_fields();
+
                fields_reset(&out_fields);
                select_fields();