mj.ucw.cz Git - xsv.git/blobdiff - xsv.c
Table backend is locale-aware
[xsv.git] / xsv.c
diff --git a/xsv.c b/xsv.c
index 7ec94da6440aee49a39f600d3a714829354a6a92..1c6dada778a925c3b7eb49859c990ca580987236 100644 (file)
--- a/xsv.c
+++ b/xsv.c
@@ -9,6 +9,8 @@
 #include <string.h>
 #include <stdarg.h>
 #include <getopt.h>
+#include <wchar.h>
+#include <locale.h>
 
 #include <pcre.h>
 
@@ -24,6 +26,13 @@ static void *xmalloc(size_t bytes)
        return p;
 }
 
+static void *xmalloc_zero(size_t bytes)  // Allocate `bytes` bytes via xmalloc() and return them zero-filled
+{
+       void *p = xmalloc(bytes);
+       memset(p, 0, bytes);
+       return p;
+}
+
 static void *xrealloc(void *old, size_t bytes)
 {
        void *p = realloc(old, bytes);
@@ -51,6 +60,8 @@ static void *xrealloc(void *old, size_t bytes)
        static inline type *name##_nth(name##_t *b, int n) { return &b->start[n]; }             \
        // end
 
+DECLARE_BUF(intarray, int);
+
 /*** Formats and their parameters ***/
 
 enum format_id {
@@ -59,6 +70,8 @@ enum format_id {
        FORM_CSV,
        FORM_WS,
        FORM_REGEX,
+       FORM_TMP,
+       FORM_TABLE,
 };
 
 struct format {
@@ -68,14 +81,28 @@ struct format {
        int quiet;
        int (*read_line)(void);
        void (*write_line)(void);
+       int needs_two_passes;
+
        // CSV backend:
        int always_quote;
+
+       // WS backend:
+       int strict_ws;
+
        // regex backend:
        pcre *pcre;
        pcre_extra *pcre_extra;
+
+       // Temporary file backend:
+       FILE *tmp_file;
+       intarray_t column_widths;
+
+       // Table backend:
+       int table_sep;
 };
 
 static struct format *in_format, *out_format;
+static int want_trim;
 
 struct field {
        int start_pos;
@@ -90,17 +117,17 @@ static struct field *in_field;
 static line_t in_line;
 static int line_number;
 
-static void new_field(void)
+static void new_field(int pos)  // Append a new, empty field to in_fields and make it current; pos is its start offset within in_line
 {
        in_field = fields_push(&in_fields);
-       in_field->start_pos = line_count(&in_line);
+       in_field->start_pos = pos;  // the caller supplies the offset instead of it being taken from the current line length
        in_field->len = 0;
 }
 
-static void ensure_field(void)
+static void ensure_field(int pos)  // Open a field at offset pos only when no field is current (in_field is NULL)
 {
        if (!in_field)
-               new_field();
+               new_field(pos);
 }
 
 static void warn(struct format *fmt, char *msg, ...)
@@ -129,11 +156,33 @@ static int next_line(void)
        }
 }
 
+static int field_chars(struct field *f)  // Return the number of multibyte characters in field f (bytes f->start_pos .. +f->len of in_line)
+{
+       unsigned char *s = line_nth(&in_line, f->start_pos);
+       int i = 0;
+       mbstate_t mbs;
+       memset(&mbs, 0, sizeof(mbs));  // zeroed mbstate_t = initial conversion state
+
+       int chars = 0;
+       while (i < f->len) {
+               size_t k = mbrlen((char *) s + i, f->len - i, &mbs);  // byte length of the next character in the current LC_CTYPE locale
+               if ((int) k <= 0)  // 0 (NUL character) or (size_t)-1 / (size_t)-2 (invalid / incomplete sequence): stop counting here
+                       break;
+               i += k;
+               chars++;
+       }
+
+       return chars;
+}
+
+/*** CSV/TSV back-end ***/
+
 static int csv_read(void)
 {
        int quoted = 0;
        for (;;) {
                int c = getchar();
+               int i = line_count(&in_line);
 restart:
                if (c == '\r')
                        continue;
@@ -159,11 +208,11 @@ restart:
                        quoted = 1;
                        continue;
                } else if (c == in_format->fs && !quoted) {
-                       ensure_field();
-                       new_field();
+                       ensure_field(i);
+                       new_field(i);
                        continue;
                }
-               ensure_field();
+               ensure_field(i);
                *line_push(&in_line) = c;
                in_field->len++;
        }
@@ -207,31 +256,45 @@ static void csv_write(void)
        putchar('\n');
 }
 
+/*** White-space back-end ***/
+
 static int ws_read(void)
 {
+       if (!next_line())  // Reader for whitespace-separated input: works on the line obtained by next_line(); 0 from it is passed on
+               return 0;
+
+       unsigned char *line = line_first(&in_line);
+       int n = line_count(&in_line);
+       if (!n)  // empty line: report it as a line without any fields
+               return 1;
+
        int ws = 0;
-       for (;;) {
-               int c = getchar();
-               if (c < 0)
-                       return !!fields_count(&in_fields);
-               if (c == '\r')
-                       continue;
-               if (c == '\n')
-                       return 1;
+       new_field(0);  // tentative first field starting at offset 0
+       for (int i=0; i<n; i++) {
+               int c = line[i];
                if (is_ws(c)) {
-                       ensure_field();
-                       if (!ws)
-                               new_field();
                        ws++;
                } else {
-                       ensure_field();
-                       *line_push(&in_line) = c;
+                       if (ws) {  // first non-whitespace byte after a run of whitespace
+                               if (!in_field->start_pos &&
+                                   !in_field->len &&
+                                   !in_format->strict_ws)
+                                       in_field->start_pos = i;  // non-strict mode: leading whitespace just moves the start of the still-empty first field
+                               else
+                                       new_field(i);
+                               ws = 0;
+                       }
                        in_field->len++;
-                       ws = 0;
                }
        }
+
+       if (ws && in_format->strict_ws)  // strict mode: trailing whitespace produces an empty last field
+               new_field(n);
+       return 1;
 }
 
+/*** Regex back-end ***/
+
 static const char *regex_set(struct format *f, char *rx)
 {
        const char *err;
@@ -265,18 +328,102 @@ static int regex_read(void)
                        if (sep != PCRE_ERROR_NOMATCH)
                                warn(in_format, "PCRE matching error %d", sep);
                        // No further occurrence of the separator: the rest is a single field
-                       new_field();
-                       in_field->start_pos = i;
+                       new_field(i);
                        in_field->len = n - i;
                        return 1;
                }
-               new_field();
-               in_field->start_pos = i;
+               new_field(i);
                in_field->len = ovec[0] - i;
                i = ovec[1];
        }
 }
 
+/*** Table back-end ***/
+
+static void table_write(void)  // Print the fields in in_fields as one table row on stdout, padding each column to its recorded width
+{
+       for (int i = 0; i < fields_count(&in_fields); i++) {
+               if (i)
+                       printf("%*s", out_format->table_sep, "");  // table_sep spaces between adjacent columns
+               struct field *f = fields_nth(&in_fields, i);
+               int fw = field_chars(f);
+               int cw = *intarray_nth(&in_format->column_widths, i);  // widths are read from the input format; two_pass() makes that the temporary-file format filled by tmp_write()
+               if (fw > cw) {
+                       warn(out_format, "Internal error: Wrongly calculated column width (%d > %d)", fw, cw);
+                       cw = fw;  // use the field's own width, so no padding is emitted for it
+               }
+               unsigned char *p = line_nth(&in_line, f->start_pos);
+               for (int j = 0; j < f->len; j++)  // the field itself is copied byte by byte
+                       putchar(p[j]);
+               while (fw < cw) {  // padding is counted in characters (field_chars), not bytes
+                       putchar(' ');
+                       fw++;
+               }
+       }
+       putchar('\n');
+}
+
+/*** Temporary file back-end ***/
+
+static int tmp_read(void)  // Read one record from in_format->tmp_file into in_line/in_fields; returns 1 for a record, 0 at end of file or on truncated data
+{
+       FILE *tf = in_format->tmp_file;
+
+       for (;;) {
+               int c = fgetc(tf);  // first byte of a field: its length, or one of the markers 0xfe / 0xff
+               if (c < 0)
+                       return 0;
+               if (c == 0xff)  // end-of-record marker written by tmp_write()
+                       return 1;
+               if (c == 0xfe) {  // escape: the length follows as 4 bytes, most significant first
+                       c = fgetc(tf);  // NOTE(review): EOF from these four fgetc() calls is not checked
+                       c = (c << 8) | fgetc(tf);
+                       c = (c << 8) | fgetc(tf);
+                       c = (c << 8) | fgetc(tf);
+               }
+               new_field(line_count(&in_line));  // the field's bytes are appended at the current end of in_line
+               in_field->len = c;
+               while (c--) {
+                       int x = fgetc(tf);
+                       if (x < 0) {
+                               warn(in_format, "Truncated temporary file");
+                               return 0;
+                       }
+                       *line_push(&in_line) = x;
+               }
+       }
+}
+
+static void tmp_write(void)  // Append the fields in in_fields to out_format->tmp_file as one record and update out_format->column_widths
+{
+       FILE *tf = out_format->tmp_file;
+
+       for (int i = 0; i < fields_count(&in_fields); i++) {
+               struct field *f = fields_nth(&in_fields, i);
+               if (f->len < 0xfe)  // lengths below 0xfe are stored in a single byte
+                       fputc(f->len, tf);
+               else {
+                       fputc(0xfe, tf);  // escape byte, followed by the length as 4 bytes, most significant first
+                       fputc((f->len >> 24) & 0xff, tf);
+                       fputc((f->len >> 16) & 0xff, tf);
+                       fputc((f->len >> 8) & 0xff, tf);
+                       fputc(f->len & 0xff, tf);
+               }
+
+               unsigned char *p = line_nth(&in_line, f->start_pos);
+               for (int j = 0; j < f->len; j++)  // raw field bytes follow the length
+                       fputc(*p++, tf);
+
+               intarray_t *w = &out_format->column_widths;
+               while (i >= intarray_count(w))  // extend the width array with zeros until it has an entry for column i
+                       *intarray_push(w) = 0;
+               int fw = field_chars(f);
+               if (*intarray_nth(w, i) < fw)  // keep the largest character count seen so far in this column
+                       *intarray_nth(w, i) = fw;
+       }
+       fputc(0xff, tf);  // end-of-record marker
+}
+
 /*** Transforms ***/
 
 static void trim_fields(void)
@@ -343,6 +490,53 @@ static void select_fields(void)
        }
 }
 
+/*** Processing of files ***/
+
+static void one_pass(void)  // Read lines with in_format and write each with out_format until read_line() returns 0
+{
+       line_number = 0;  // numbering restarts on every call; two_pass() calls this function twice
+       for (;;) {
+               line_number++;
+               fields_reset(&in_fields);  // reset the per-line state before reading the next line
+               line_reset(&in_line);
+               in_field = NULL;
+               if (!in_format->read_line())
+                       break;
+
+               if (want_trim)
+                       trim_fields();
+
+               fields_reset(&out_fields);
+               select_fields();
+
+               out_format->write_line();
+       }
+}
+
+static void two_pass(void)  // Run one_pass() twice: input -> temporary file (collecting column widths), then temporary file -> requested output format
+{
+       struct format *final_format = out_format;
+
+       // We need to use character set info from the current locale
+       setlocale(LC_CTYPE, "");
+
+       // Pass 1: Set up writer of intermediate format
+       out_format = xmalloc_zero(sizeof(*out_format));
+       out_format->id = FORM_TMP;
+       out_format->read_line = tmp_read;
+       out_format->write_line = tmp_write;
+       out_format->tmp_file = tmpfile();  // NOTE(review): tmpfile() may return NULL; the result is not checked here
+       intarray_init(&out_format->column_widths);
+       one_pass();
+
+       // Pass 2: Set up reader of intermediate format
+       in_format = out_format;  // the temporary format, including the collected column_widths, becomes the input format
+       rewind(in_format->tmp_file);
+       out_format = final_format;
+       one_pass();
+       fclose(in_format->tmp_file);
+}
+
 /*** Parsing of arguments ***/
 
 static void usage(void)
@@ -354,12 +548,15 @@ Formats:\n\
 -t, --tsv              TAB-separated values (default)\n\
 -c, --csv              Comma-separated values\n\
 -w, --ws               Values separated by arbitrary whitespace\n\
+-W, --strict-ws                Like --ws, but recognize empty columns at start/end\n\
 -r, --regex=<rx>       Separator given by Perl regular expression (input only)\n\
+    --table            Format a table (output only)\n\
 \n\
 Format parameters:\n\
 -d, --fs=<char>                Delimiter of fields\n\
 -q, --quiet            Do not show warnings\n\
     --always-quote     Put quotes around all fields (CSV output only)\n\
+    --table-sep=<n>    Separate table columns by <n> spaces (default: 2)\n\
 \n\
 Other options:\n\
     --trim             Trim leading and trailing whitespaces in fields\n\
@@ -381,12 +578,14 @@ static void bad_args(const char *msg, ...)
        exit(1);
 }
 
-static const char short_options[] = "cd:qr:tw";
+static const char short_options[] = "cd:qr:twW";
 
 enum long_options {
        OPT_HELP = 256,
        OPT_TRIM,
        OPT_ALWAYS_QUOTE,
+       OPT_TABLE,
+       OPT_TABLE_SEP,
 };
 
 static const struct option long_options[] = {
@@ -395,6 +594,9 @@ static const struct option long_options[] = {
        { "fs",                 1,      NULL,   'd' },
        { "quiet",              0,      NULL,   'q' },
        { "regex",              1,      NULL,   'r' },
+       { "strict-ws",          0,      NULL,   'W' },
+       { "table",              0,      NULL,   OPT_TABLE },
+       { "table-sep",          1,      NULL,   OPT_TABLE_SEP },
        { "trim",               0,      NULL,   OPT_TRIM },
        { "tsv",                0,      NULL,   't' },
        { "ws",                 0,      NULL,   'w' },
@@ -404,8 +606,7 @@ static const struct option long_options[] = {
 
 static void set_format(int format_id)
 {
-       struct format *f = xmalloc(sizeof(*f));
-       memset(f, 0, sizeof(*f));
+       struct format *f = xmalloc_zero(sizeof(*f));
        f->id = format_id;
 
        switch (format_id) {
@@ -430,6 +631,11 @@ static void set_format(int format_id)
                case FORM_REGEX:
                        f->read_line = regex_read;
                        break;
+               case FORM_TABLE:
+                       f->write_line = table_write;
+                       f->needs_two_passes = 1;
+                       f->table_sep = 2;
+                       break;
        }
 
        if (!in_format)
@@ -437,7 +643,7 @@ static void set_format(int format_id)
        else if (!out_format)
                out_format = f;
        else
-               bad_args("At most two format may be given.");
+               bad_args("At most two formats may be given.");
 }
 
 static struct format *current_format(void)
@@ -453,7 +659,6 @@ static struct format *current_format(void)
 int main(int argc, char **argv)
 {
        int opt;
-       int want_trim = 0;
        const char *err;
 
        while ((opt = getopt_long(argc, argv, short_options, long_options, NULL)) >= 0)
@@ -482,6 +687,10 @@ int main(int argc, char **argv)
                        case 'w':
                                set_format(FORM_WS);
                                break;
+                       case 'W':
+                               set_format(FORM_WS);
+                               current_format()->strict_ws = 1;
+                               break;
                        case OPT_ALWAYS_QUOTE:
                                if (current_format()->id != FORM_CSV)
                                        bad_args("--always-quote makes sense only for CSV.");
@@ -492,6 +701,12 @@ int main(int argc, char **argv)
                        case OPT_TRIM:
                                want_trim = 1;
                                break;
+                       case OPT_TABLE:
+                               set_format(FORM_TABLE);
+                               break;
+                       case OPT_TABLE_SEP:
+                               current_format()->table_sep = atoi(optarg);
+                               break;
                        default:
                                bad_args(NULL);
                }
@@ -515,22 +730,9 @@ int main(int argc, char **argv)
        fields_init(&out_fields);
        line_init(&in_line);
 
-       for (;;) {
-               line_number++;
-               fields_reset(&in_fields);
-               line_reset(&in_line);
-               in_field = NULL;
-               if (!in_format->read_line())
-                       break;
-
-               if (want_trim)
-                       trim_fields();
-
-               fields_reset(&out_fields);
-               select_fields();
-
-               out_format->write_line();
-       }
-
+       if (out_format->needs_two_passes)
+               two_pass();
+       else
+               one_pass();
        return 0;
 }