]> mj.ucw.cz Git - xsv.git/blobdiff - xsv.c
Man page: Fixed a couple of formatting typos
[xsv.git] / xsv.c
diff --git a/xsv.c b/xsv.c
index 32da50480651ecd4d78968e7820a18e628020ebc..b815c7205a774d09d88335496bde4bc7b38c584a 100644 (file)
--- a/xsv.c
+++ b/xsv.c
@@ -1,11 +1,9 @@
 /*
- *     A Swiss-Army Knife for CSV-like Files
+ *     The Swiss-Army Knife for CSV-like Files
  *
  *     (c) 2012 Martin Mares <mj@ucw.cz>
  */
 
-#define _GNU_SOURCE
-
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -24,6 +22,9 @@
 #define UNUSED
 #endif
 
+static void select_fields(void);
+static void select_all_fields(void);
+
 /*** General functions ***/
 
 static void NONRET die(char *msg, ...)
@@ -99,6 +100,7 @@ struct format {
        int fs;
        int quote;
        int quiet;
+       int sloppy;
        int (*read_line)(struct format *fmt);
        void (*write_line)(struct format *fmt);
        void (*write_grid)(struct format *fmt, int pos);        // -1=above, 1=below, 0=after header
@@ -112,9 +114,6 @@ struct format {
        // CSV backend:
        int always_quote;
 
-       // WS backend:
-       int strict_ws;
-
        // regex backend:
        pcre *pcre;
        pcre_extra *pcre_extra;
@@ -128,7 +127,7 @@ struct format {
 };
 
 static struct format *in_format, *out_format;
-static int want_trim;
+static int want_trim, want_equalize, want_stats;
 
 struct field {
        int start_pos;
@@ -242,6 +241,9 @@ static intarray_t column_widths;
 
 static void update_stats(void)
 {
+       if (!want_stats)
+               return;
+
        for (int i = 0; i < fields_count(&out_fields); i++) {
                struct field *f = fields_nth(&out_fields, i);
                intarray_t *w = &column_widths;
@@ -356,7 +358,7 @@ static int ws_read(struct format *fmt)
                        if (ws) {
                                if (!in_field->start_pos &&
                                    !in_field->len &&
-                                   !fmt->strict_ws)
+                                   fmt->sloppy)
                                        in_field->start_pos = i;
                                else
                                        new_field(i);
@@ -366,7 +368,7 @@ static int ws_read(struct format *fmt)
                }
        }
 
-       if (ws && fmt->strict_ws)
+       if (ws && !fmt->sloppy)
                new_field(n);
        return 1;
 }
@@ -401,17 +403,27 @@ static int regex_read(struct format *fmt)
        int i = 0;
        for (;;) {
                int ovec[3];
-               int sep = pcre_exec(fmt->pcre, fmt->pcre_extra, (char *) c, n, i, 0, ovec, 3);
-               if (sep < 0) {
-                       if (sep != PCRE_ERROR_NOMATCH)
-                               warn(fmt, "PCRE matching error %d", sep);
+               int err = pcre_exec(fmt->pcre, fmt->pcre_extra, (char *) c, n, i, 0, ovec, 3);
+               if (err < 0) {
+                       if (err != PCRE_ERROR_NOMATCH)
+                               warn(fmt, "PCRE matching error %d", err);
                        // No further occurrence of the separator: the rest is a single field
+                       if (!fmt->sloppy || i < n) {
+                               new_field(i);
+                               in_field->len = n - i;
+                       }
+                       return 1;
+               }
+               if (ovec[0] == ovec[1]) {
+                       warn(fmt, "Regular expression matched an empty separator.");
                        new_field(i);
                        in_field->len = n - i;
                        return 1;
                }
-               new_field(i);
-               in_field->len = ovec[0] - i;
+               if (!fmt->sloppy || ovec[0]) {
+                       new_field(i);
+                       in_field->len = ovec[0] - i;
+               }
                i = ovec[1];
        }
 }
@@ -434,7 +446,7 @@ static void table_write(struct format *fmt)
                        unsigned char *p = get_field(&out_fields, i, &len);
                        fw = field_chars(fields_nth(&out_fields, i));
                        if (fw > cw) {
-                               warn(fmt, "Internal error: Wrongly calculated column width (%d > %d)", fw, cw);
+                               warn(fmt, "Internal error: Wrongly calculated width of column %d (%d > %d)", i, fw, cw);
                                cw = fw;
                        }
                        while (len--)
@@ -461,7 +473,7 @@ static void table_write_grid(struct format *fmt, int pos UNUSED)
 
        for (int i = 0; i < intarray_count(&column_widths); i++) {
                putchar_unlocked('+');
-               int w = fmt->table_sep + *intarray_nth(&column_widths, i);      // FIXME: Avoid the *
+               int w = fmt->table_sep + *intarray_nth(&column_widths, i);
                while (w--)
                        putchar('-');
        }
@@ -491,10 +503,8 @@ static int tmp_read(struct format *fmt)
                in_field->len = c;
                while (c--) {
                        int x = getc_unlocked(tf);
-                       if (x < 0) {
-                               warn(fmt, "Truncated temporary file");
-                               return 0;
-                       }
+                       if (x < 0)
+                               die("Truncated temporary file");
                        *line_push(&in_line) = x;
                }
        }
@@ -544,6 +554,14 @@ static void trim_fields(void)
        }
 }
 
+static void equalize_fields(void)
+{
+       while (fields_count(&out_fields) < intarray_count(&column_widths)) {
+               struct field *f = fields_push(&out_fields);
+               f->start_pos = f->len = 0;
+       }
+}
+
 /*** Field names and headers ***/
 
 struct field_names {
@@ -600,20 +618,22 @@ static void write_header(void)
                return;
        }
 
+       int want_select_fields = 0;
        if (out_format->set_field_names) {
                struct field_names *fn = xmalloc_zero(sizeof(*fn));
                out_format->field_names = fn;
                add_field_names(fn, out_format->set_field_names);
-       } else if (in_format->field_names)
+       } else if (in_format->field_names) {
                out_format->field_names = in_format->field_names;
-       else
+               want_select_fields = 1;
+       } else
                die("Output header requested, but no field names specified");
 
        line_reset(&in_line);
-       fields_reset(&out_fields);
+       fields_reset(&in_fields);
        struct field_names *fn = out_format->field_names;
        for (int i = 0; i < stringarray_count(&fn->names); i++) {
-               struct field *f = fields_push(&out_fields);
+               struct field *f = fields_push(&in_fields);
                f->start_pos = line_count(&in_line);
                f->len = 0;
                char *s = *stringarray_nth(&fn->names, i);
@@ -623,10 +643,20 @@ static void write_header(void)
                }
        }
 
+       fields_reset(&out_fields);
+       if (want_select_fields)
+               select_fields();
+       else
+               select_all_fields();
+
        // This is tricky: when we are formatting a table, field names are normally
        // calculated in pass 1, but the header is written in pass 2, so we have to
        // update column statistics, because field name can be too wide to fit.
+       want_stats++;
        update_stats();
+       want_stats--;
+       if (want_equalize)
+               equalize_fields();
        write_grid(-1);
        write_line();
        write_grid(0);
@@ -681,7 +711,7 @@ static int parse_field(char *str)
        if (in_format->field_names && (f = find_field_by_name(in_format->field_names, str)) > 0)
                return f;
 
-       die("Unknown field %s", str);
+       die("Unknown field `%s'", str);
 }
 
 static char *parse_selector(char *str)
@@ -754,9 +784,9 @@ static void one_pass(int pass)
                else
                        select_all_fields();
 
-               if (out_format->needs_stats)
-                       update_stats();
-
+               if (want_equalize && (pass & 2))
+                       equalize_fields();
+               update_stats();
                write_line();
        }
 
@@ -777,7 +807,6 @@ static void two_pass(void)
        out_format->read_line = tmp_read;
        out_format->write_line = tmp_write;
        out_format->tmp_file = tmpfile();
-       out_format->needs_stats = final_format->needs_stats;
        out_format->field_names = in_format->field_names;
        one_pass(1);
 
@@ -786,7 +815,7 @@ static void two_pass(void)
        rewind(in_format->tmp_file);
        line_number = 0;
        out_format = final_format;
-       out_format->needs_stats = 0;
+       want_stats = 0;
        one_pass(2);
        fclose(in_format->tmp_file);
 }
@@ -799,10 +828,9 @@ static void NONRET usage(void)
 Usage: xsv <in-format> [<out-format>] <options> [<fields>]\n\
 \n\
 Formats:\n\
--t, --tsv              TAB-separated values (default)\n\
+-t, --tsv              Tab-separated values (default)\n\
 -c, --csv              Comma-separated values\n\
 -w, --ws               Values separated by arbitrary whitespace\n\
--W, --strict-ws                Like --ws, but recognize empty columns at start/end\n\
 -r, --regex=<rx>       Separator given by Perl regular expression (input only)\n\
     --table            Format a table (output only)\n\
 \n\
@@ -814,9 +842,11 @@ Format parameters:\n\
     --always-quote     Put quotes around all fields (CSV output only)\n\
     --table-sep=<n>    Separate table columns by <n> spaces (default: 2)\n\
     --grid             Separate table columns by grid lines\n\
+-s, --sloppy           Ignore separators at the start/end of line (ws/regex only)\n\
 \n\
 Other options:\n\
     --trim             Trim leading and trailing whitespaces in fields\n\
+    --equalize         Pad all lines to the maximum number of fields\n\
 ");
        exit(0);
 }
@@ -839,29 +869,33 @@ static const char short_options[] = "cd:f:hqr:twW";
 
 enum long_options {
        OPT_HELP = 256,
+       OPT_VERSION,
        OPT_TRIM,
        OPT_ALWAYS_QUOTE,
        OPT_TABLE,
        OPT_TABLE_SEP,
        OPT_GRID,
+       OPT_EQUALIZE,
 };
 
 static const struct option long_options[] = {
        { "always-quote",       0,      NULL,   OPT_ALWAYS_QUOTE },
        { "csv",                0,      NULL,   'c' },
+       { "equalize",           0,      NULL,   OPT_EQUALIZE },
        { "fields",             1,      NULL,   'f' },
        { "fs",                 1,      NULL,   'd' },
        { "grid",               0,      NULL,   OPT_GRID },
        { "header",             0,      NULL,   'h' },
+       { "help",               0,      NULL,   OPT_HELP },
        { "quiet",              0,      NULL,   'q' },
        { "regex",              1,      NULL,   'r' },
-       { "strict-ws",          0,      NULL,   'W' },
+       { "sloppy",             0,      NULL,   's' },
        { "table",              0,      NULL,   OPT_TABLE },
        { "table-sep",          1,      NULL,   OPT_TABLE_SEP },
        { "trim",               0,      NULL,   OPT_TRIM },
        { "tsv",                0,      NULL,   't' },
+       { "version",            0,      NULL,   OPT_VERSION },
        { "ws",                 0,      NULL,   'w' },
-       { "help",               0,      NULL,   OPT_HELP },
        { NULL,                 0,      NULL,   0 },
 };
 
@@ -949,23 +983,27 @@ int main(int argc, char **argv)
                                if (err)
                                        bad_args("Error compiling regex: %s", err);
                                break;
+                       case 's':
+                               if (current_format()->id != FORM_WS && current_format()->id != FORM_REGEX)
+                                       bad_args("--sloppy makes sense only for --ws or --regex.");
+                               current_format()->sloppy = 1;
+                               break;
                        case 't':
                                set_format(FORM_TSV);
                                break;
                        case 'w':
                                set_format(FORM_WS);
                                break;
-                       case 'W':
-                               set_format(FORM_WS);
-                               current_format()->strict_ws = 1;
-                               break;
                        case OPT_ALWAYS_QUOTE:
                                if (current_format()->id != FORM_CSV)
-                                       bad_args("--always-quote makes sense only for CSV.");
+                                       bad_args("--always-quote makes sense only for --csv.");
                                current_format()->always_quote = 1;
                                break;
                        case OPT_HELP:
                                usage();
+                       case OPT_VERSION:
+                               puts("This is xsv version " VERSION ".");
+                               exit(0);
                        case OPT_TRIM:
                                want_trim = 1;
                                break;
@@ -978,6 +1016,9 @@ int main(int argc, char **argv)
                        case OPT_GRID:
                                current_format()->table_grid = 1;
                                break;
+                       case OPT_EQUALIZE:
+                               want_equalize = 1;
+                               break;
                        default:
                                bad_args(NULL);
                }
@@ -998,7 +1039,8 @@ int main(int argc, char **argv)
        }
        finish_parse_selectors();
 
-       if (out_format->needs_stats)
+       want_stats = out_format->needs_stats | want_equalize;
+       if (want_stats)
                two_pass();
        else
                one_pass(3);