2 * The Swiss-Army Knife for CSV-like Files
4 * (c) 2012 Martin Mares <mj@ucw.cz>
18 #define NONRET __attribute__((noreturn))
19 #define UNUSED __attribute__((unused))
25 static void select_fields(void);
26 static void select_all_fields(void);
28 /*** General functions ***/
30 static void NONRET die(char *msg, ...)
34 fprintf(stderr, "xsv: ");
35 vfprintf(stderr, msg, args);
41 /*** Memory allocation ***/
/*
 * Allocate `bytes` bytes with malloc().
 * Never returns NULL: a failed allocation is reported through die().
 */
static void *xmalloc(size_t bytes)
{
	void *block = malloc(bytes);

	if (block)
		return block;
	die("Out of memory (cannot allocate %zu bytes)", bytes);
}
/*
 * Allocate `bytes` bytes via xmalloc() and clear them to zero.
 * Allocation failure is handled inside xmalloc().
 */
static void *xmalloc_zero(size_t bytes)
{
	void *block = xmalloc(bytes);

	/* memset() hands back its first argument, i.e. the cleared block. */
	return memset(block, 0, bytes);
}
/*
 * Resize the block `old` to `bytes` bytes with realloc().
 * Never returns NULL: a failed reallocation is reported through die().
 */
static void *xrealloc(void *old, size_t bytes)
{
	void *resized = realloc(old, bytes);

	if (resized)
		return resized;
	die("Out of memory (cannot allocate %zu bytes)", bytes);
}
66 #define DECLARE_BUF(name, type) \
67 typedef struct { type *start; int count; int max; } name##_t; \
68 static inline void name##_init(name##_t *b) { b->start = NULL; b->count = b->max = 0; } \
69 static inline void name##_reset(name##_t *b) { b->count = 0; } \
70 static inline int name##_count(name##_t *b) { return b->count; } \
71 static void name##_extend(name##_t *b) { \
72 b->max = b->max ? 2*b->max : 16; \
73 b->start = xrealloc(b->start, b->max * sizeof(type)); \
75 static inline type *name##_push(name##_t *b) { \
76 if (b->count >= b->max) name##_extend(b); \
77 return &b->start[b->count++]; \
79 static inline type *name##_first(name##_t *b) { return b->start; } \
80 static inline type *name##_nth(name##_t *b, int n) { return &b->start[n]; } \
83 DECLARE_BUF(intarray, int);
84 DECLARE_BUF(stringarray, char *);
86 /*** Formats and their parameters ***/
104 int (*read_line)(struct format *fmt);
105 void (*write_line)(struct format *fmt);
106 void (*write_grid)(struct format *fmt, int pos); // -1=above, 1=below, 0=after header
111 char *set_field_names;
112 struct field_names *field_names;
119 pcre_extra *pcre_extra;
121 // Temporary file backend:
129 static struct format *in_format, *out_format;
130 static int want_trim, want_equalize, want_stats;
137 DECLARE_BUF(fields, struct field);
138 DECLARE_BUF(line, unsigned char);
140 static fields_t in_fields, out_fields;
141 static struct field *in_field;
142 static line_t in_line;
143 static int line_number;
145 static int read_line(void)
147 fields_reset(&in_fields);
148 line_reset(&in_line);
150 if (!in_format->read_line(in_format))
152 if (ferror_unlocked(stdin))
153 die("I/O error when reading standard input");
157 static void write_line(void)
159 out_format->write_line(out_format);
160 if (ferror_unlocked(stdout))
161 die("I/O error when writing standard input");
164 static void write_grid(int pos)
166 if (out_format->write_grid) {
167 out_format->write_grid(out_format, pos);
168 if (ferror_unlocked(stdout))
169 die("I/O error when writing standard input");
173 static void new_field(int pos)
175 in_field = fields_push(&in_fields);
176 in_field->start_pos = pos;
180 static void ensure_field(int pos)
186 static unsigned char *get_field(fields_t *fields, int i, int *len)
188 struct field *f = fields_nth(fields, i);
190 return line_nth(&in_line, f->start_pos);
193 static void warn(struct format *fmt, char *msg, ...)
196 fprintf(stderr, "Warning at line %d: ", line_number);
199 vfprintf(stderr, msg, args);
205 static int next_line(void)
208 int c = getchar_unlocked();
212 return !!line_count(&in_line);
215 *line_push(&in_line) = c;
219 static int field_chars(struct field *f)
221 unsigned char *s = line_nth(&in_line, f->start_pos);
224 memset(&mbs, 0, sizeof(mbs));
228 size_t k = mbrlen((char *) s + i, f->len - i, &mbs);
238 /*** Field statistics ***/
240 static intarray_t column_widths;
242 static void update_stats(void)
247 for (int i = 0; i < fields_count(&out_fields); i++) {
248 struct field *f = fields_nth(&out_fields, i);
249 intarray_t *w = &column_widths;
251 while (i >= intarray_count(w))
252 *intarray_push(w) = 0;
253 int fw = field_chars(f);
254 if (*intarray_nth(w, i) < fw)
255 *intarray_nth(w, i) = fw;
259 /*** CSV/TSV back-end ***/
261 static int csv_read(struct format *fmt)
265 int c = getchar_unlocked();
266 int i = line_count(&in_line);
270 if (c < 0 || c == '\n') {
272 warn(fmt, "Missing closing quote.");
274 return !!fields_count(&in_fields);
279 if (c == fmt->quote) {
280 c = getchar_unlocked();
281 if (c != fmt->quote) {
285 // Two quotes assimilate to one
287 // Fall through to pushing the character
288 } else if (c == fmt->quote) {
291 } else if (c == fmt->fs && !quoted) {
297 *line_push(&in_line) = c;
/* Return 1 when c is a space, horizontal tab or form feed; 0 otherwise. */
static int is_ws(int c)
{
	switch (c) {
	case ' ':
	case '\t':
	case '\f':
		return 1;
	default:
		return 0;
	}
}
307 static void csv_write(struct format *fmt)
309 for (int i=0; i < fields_count(&out_fields); i++) {
311 unsigned char *p = get_field(&out_fields, i, &len);
314 if (fmt->quote >= 0) {
315 need_quotes = fmt->always_quote;
316 for (int j=0; !need_quotes && j < len; j++) {
317 if (p[j] == fmt->fs || p[j] == fmt->quote)
322 putchar_unlocked(fmt->fs);
324 putchar_unlocked(fmt->quote);
325 for (int j=0; j < len; j++) {
327 if (c == fmt->fs && !need_quotes)
328 warn(fmt, "Field separator found inside field and quoting is turned off.");
334 putchar_unlocked(fmt->quote);
336 putchar_unlocked('\n');
339 /*** White-space back-end ***/
341 static int ws_read(struct format *fmt)
346 unsigned char *line = line_first(&in_line);
347 int n = line_count(&in_line);
353 for (int i=0; i<n; i++) {
359 if (!in_field->start_pos &&
362 in_field->start_pos = i;
371 if (ws && !fmt->sloppy)
376 /*** Regex back-end ***/
378 static const char *regex_set(struct format *f, char *rx)
382 f->pcre = pcre_compile(rx, PCRE_DOLLAR_ENDONLY, &err, &errpos, NULL);
386 f->pcre_extra = pcre_study(f->pcre, 0, &err);
393 static int regex_read(struct format *fmt)
398 unsigned char *c = line_first(&in_line);
399 int n = line_count(&in_line);
406 int err = pcre_exec(fmt->pcre, fmt->pcre_extra, (char *) c, n, i, 0, ovec, 3);
408 if (err != PCRE_ERROR_NOMATCH)
409 warn(fmt, "PCRE matching error %d", err);
410 // No further occurrence of the separator: the rest is a single field
411 if (!fmt->sloppy || i < n) {
413 in_field->len = n - i;
417 if (ovec[0] == ovec[1]) {
418 warn(fmt, "Regular expression matched an empty separator.");
420 in_field->len = n - i;
423 if (!fmt->sloppy || ovec[0]) {
425 in_field->len = ovec[0] - i;
431 /*** Table back-end ***/
433 static void table_write(struct format *fmt)
435 for (int i = 0; i < intarray_count(&column_widths); i++) {
436 if (fmt->table_grid) {
437 putchar_unlocked('|');
438 printf("%*s", fmt->table_sep / 2, "");
440 printf("%*s", fmt->table_sep, "");
442 int cw = *intarray_nth(&column_widths, i);
444 if (i < fields_count(&out_fields)) {
446 unsigned char *p = get_field(&out_fields, i, &len);
447 fw = field_chars(fields_nth(&out_fields, i));
449 warn(fmt, "Internal error: Wrongly calculated width of column %d (%d > %d)", i, fw, cw);
453 putchar_unlocked(*p++);
456 putchar_unlocked(' ');
461 printf("%*s", fmt->table_sep - fmt->table_sep / 2, "");
465 putchar_unlocked('|');
466 putchar_unlocked('\n');
469 static void table_write_grid(struct format *fmt, int pos UNUSED)
471 if (!fmt->table_grid)
474 for (int i = 0; i < intarray_count(&column_widths); i++) {
475 putchar_unlocked('+');
476 int w = fmt->table_sep + *intarray_nth(&column_widths, i);
478 putchar_unlocked('-');
480 putchar_unlocked('+');
481 putchar_unlocked('\n');
484 /*** Temporary file back-end ***/
486 static int tmp_read(struct format *fmt)
488 FILE *tf = fmt->tmp_file;
491 int c = getc_unlocked(tf);
497 c = getc_unlocked(tf);
498 c = (c << 8) | getc_unlocked(tf);
499 c = (c << 8) | getc_unlocked(tf);
500 c = (c << 8) | getc_unlocked(tf);
502 new_field(line_count(&in_line));
505 int x = getc_unlocked(tf);
507 die("Truncated temporary file");
508 *line_push(&in_line) = x;
512 if (ferror_unlocked(tf))
513 die("I/O error when reading temporary file");
516 static void tmp_write(struct format *fmt)
518 FILE *tf = fmt->tmp_file;
520 for (int i = 0; i < fields_count(&out_fields); i++) {
522 unsigned char *p = get_field(&out_fields, i, &len);
525 putc_unlocked(len, tf);
527 putc_unlocked(0xfe, tf);
528 putc_unlocked((len >> 24) & 0xff, tf);
529 putc_unlocked((len >> 16) & 0xff, tf);
530 putc_unlocked((len >> 8) & 0xff, tf);
531 putc_unlocked(len & 0xff, tf);
535 putc_unlocked(*p++, tf);
537 putc_unlocked(0xff, tf);
539 if (ferror_unlocked(tf))
540 die("I/O error when writing temporary file");
545 static void trim_fields(void)
547 unsigned char *line = line_first(&in_line);
548 for (int i = 0; i < fields_count(&in_fields); i++) {
549 struct field *f = fields_nth(&in_fields, i);
550 while (f->len && is_ws(line[f->start_pos]))
551 f->start_pos++, f->len--;
552 while (f->len && is_ws(line[f->start_pos + f->len - 1]))
557 static void equalize_fields(void)
559 while (fields_count(&out_fields) < intarray_count(&column_widths)) {
560 struct field *f = fields_push(&out_fields);
561 f->start_pos = f->len = 0;
565 /*** Field names and headers ***/
571 static void add_field(struct field_names *fn, char *name, int namelen)
573 char *n = xmalloc(namelen + 1);
574 memcpy(n, name, namelen);
576 *stringarray_push(&fn->names) = n;
579 static void add_field_names(struct field_names *fn, char *names)
583 char *q = strchr(p, ',');
584 int len = q ? q-p : (int) strlen(p);
585 add_field(fn, p, len);
590 static void read_header(void)
592 if (!(in_format->has_header || in_format->set_field_names))
595 struct field_names *fn = xmalloc_zero(sizeof(*fn));
596 in_format->field_names = fn;
598 if (in_format->has_header) {
600 die("Missing input header");
603 if (in_format->set_field_names) {
604 add_field_names(fn, in_format->set_field_names);
606 for (int i = 0; i < fields_count(&in_fields); i++) {
608 char *s = (char *) get_field(&in_fields, i, &len);
609 add_field(fn, s, len);
614 static void write_header(void)
616 if (!out_format->has_header) {
621 int want_select_fields = 0;
622 if (out_format->set_field_names) {
623 struct field_names *fn = xmalloc_zero(sizeof(*fn));
624 out_format->field_names = fn;
625 add_field_names(fn, out_format->set_field_names);
626 } else if (in_format->field_names) {
627 out_format->field_names = in_format->field_names;
628 want_select_fields = 1;
630 die("Output header requested, but no field names specified");
632 line_reset(&in_line);
633 fields_reset(&in_fields);
634 struct field_names *fn = out_format->field_names;
635 for (int i = 0; i < stringarray_count(&fn->names); i++) {
636 struct field *f = fields_push(&in_fields);
637 f->start_pos = line_count(&in_line);
639 char *s = *stringarray_nth(&fn->names, i);
641 *line_push(&in_line) = *s++;
646 fields_reset(&out_fields);
647 if (want_select_fields)
652 // This is tricky: when we are formatting a table, field names are normally
653 // calculated in pass 1, but the header is written in pass 2, so we have to
654 // update column statistics, because field name can be too wide to fit.
665 static void write_footer(void)
670 static int find_field_by_name(struct field_names *fn, char *name)
672 for (int i = 0; i < stringarray_count(&fn->names); i++)
673 if (!strcmp(*stringarray_nth(&fn->names, i), name))
678 /*** Field selection ***/
681 int first_field, last_field; // 0 means "boundary"
684 DECLARE_BUF(selectors, struct selector);
685 static selectors_t selectors;
687 static int parse_field_num(char *str)
692 if (*str < '0' || *str > '9')
696 f = 10*f + *str - '0';
702 static int parse_field(char *str)
707 int f = parse_field_num(str);
711 if (in_format->field_names && (f = find_field_by_name(in_format->field_names, str)) > 0)
714 die("Unknown field `%s'", str);
717 static char *parse_selector(char *str)
719 char buf[strlen(str) + 1];
722 struct selector *s = selectors_push(&selectors);
723 char *sep = strchr(buf, '-');
726 s->first_field = parse_field(buf);
727 s->last_field = parse_field(sep);
729 s->first_field = s->last_field = parse_field(buf);
734 static void finish_parse_selectors(void)
736 if (!selectors_count(&selectors))
740 static void select_fields(void)
742 for (int i = 0; i < selectors_count(&selectors); i++) {
743 struct selector *s = selectors_nth(&selectors, i);
744 int first = s->first_field;
747 int last = s->last_field;
749 last = fields_count(&in_fields);
750 for (int j = first; j <= last; j++) {
751 struct field *f = fields_push(&out_fields);
752 if (j >= 1 && j <= fields_count(&in_fields))
753 *f = *fields_nth(&in_fields, j-1);
755 f->start_pos = f->len = 0;
760 static void select_all_fields(void)
762 for (int i = 0; i < fields_count(&in_fields); i++)
763 *fields_push(&out_fields) = *fields_nth(&in_fields, i);
766 /*** Processing of files ***/
768 static void one_pass(int pass)
778 if (want_trim && (pass & 1))
781 fields_reset(&out_fields);
787 if (want_equalize && (pass & 2))
797 static void two_pass(void)
799 struct format *final_format = out_format;
801 // We need to use character set info from the current locale
802 setlocale(LC_CTYPE, "");
804 // Pass 1: Set up writer of intermediate format
805 out_format = xmalloc_zero(sizeof(*out_format));
806 out_format->id = FORM_TMP;
807 out_format->read_line = tmp_read;
808 out_format->write_line = tmp_write;
809 out_format->tmp_file = tmpfile();
810 out_format->field_names = in_format->field_names;
813 // Pass 2: Set up reader of intermediate format
814 in_format = out_format;
815 rewind(in_format->tmp_file);
817 out_format = final_format;
820 fclose(in_format->tmp_file);
823 /*** Parsing of arguments ***/
825 static void NONRET usage(void)
828 Usage: xsv <in-format> [<out-format>] <options> [<fields>]\n\
831 -t, --tsv Tab-separated values (default)\n\
832 -c, --csv Comma-separated values\n\
833 -w, --ws Values separated by arbitrary whitespace\n\
834 -r, --regex=<rx> Separator given by Perl regular expression (input only)\n\
835 --table Format a table (output only)\n\
837 Format parameters:\n\
838 -d, --fs=<char> Delimiter of fields\n\
839 -f, --fields=<f>,... Set field names\n\
840 -h, --header The first line contains field names\n\
841 -q, --quiet Do not show warnings\n\
842 --always-quote Put quotes around all fields (CSV output only)\n\
843 --table-sep=<n> Separate table columns by <n> spaces (default: 2)\n\
844 --grid Separate table columns by grid lines\n\
845 -s, --sloppy Ignore separators at the start/end of line (ws/regex only)\n\
848 --trim Trim leading and trailing whitespaces in fields\n\
849 --equalize Pad all lines to the maximum number of fields\n\
854 static void NONRET bad_args(const char *msg, ...)
859 fprintf(stderr, "xsv: ");
860 vfprintf(stderr, msg, args);
864 fprintf(stderr, "Try `xsv --help' for more information.\n");
/*
 * Short options accepted by getopt_long(); a ':' after a letter means the
 * option takes an argument. 's' is listed because the usage text advertises
 * "-s, --sloppy" and long_options[] maps --sloppy to 's'; without it the
 * short form is rejected as an unknown option.
 */
static const char short_options[] = "cd:f:hqr:stwW";
881 static const struct option long_options[] = {
882 { "always-quote", 0, NULL, OPT_ALWAYS_QUOTE },
883 { "csv", 0, NULL, 'c' },
884 { "equalize", 0, NULL, OPT_EQUALIZE },
885 { "fields", 1, NULL, 'f' },
886 { "fs", 1, NULL, 'd' },
887 { "grid", 0, NULL, OPT_GRID },
888 { "header", 0, NULL, 'h' },
889 { "help", 0, NULL, OPT_HELP },
890 { "quiet", 0, NULL, 'q' },
891 { "regex", 1, NULL, 'r' },
892 { "sloppy", 0, NULL, 's' },
893 { "table", 0, NULL, OPT_TABLE },
894 { "table-sep", 1, NULL, OPT_TABLE_SEP },
895 { "trim", 0, NULL, OPT_TRIM },
896 { "tsv", 0, NULL, 't' },
897 { "version", 0, NULL, OPT_VERSION },
898 { "ws", 0, NULL, 'w' },
899 { NULL, 0, NULL, 0 },
902 static void set_format(int format_id)
904 struct format *f = xmalloc_zero(sizeof(*f));
911 f->read_line = csv_read;
912 f->write_line = csv_write;
917 f->read_line = csv_read;
918 f->write_line = csv_write;
923 f->read_line = ws_read;
924 f->write_line = csv_write;
927 f->read_line = regex_read;
930 f->write_line = table_write;
931 f->write_grid = table_write_grid;
939 else if (!out_format)
942 bad_args("At most two formats may be given.");
945 static struct format *current_format(void)
951 set_format(FORM_TSV);
955 int main(int argc, char **argv)
960 while ((opt = getopt_long(argc, argv, short_options, long_options, NULL)) >= 0)
963 set_format(FORM_CSV);
967 current_format()->fs = optarg[0];
969 bad_args("No field delimiter given.");
972 current_format()->set_field_names = optarg;
975 current_format()->has_header = 1;
978 current_format()->quiet = 1;
981 set_format(FORM_REGEX);
982 err = regex_set(current_format(), optarg);
984 bad_args("Error compiling regex: %s", err);
987 if (current_format()->id != FORM_WS && current_format()->id != FORM_REGEX)
988 bad_args("--sloppy makes sense only for --ws or --regex.");
989 current_format()->sloppy = 1;
992 set_format(FORM_TSV);
997 case OPT_ALWAYS_QUOTE:
998 if (current_format()->id != FORM_CSV)
999 bad_args("--always-quote makes sense only for --csv.");
1000 current_format()->always_quote = 1;
1005 puts("This is xsv version " VERSION ".");
1011 set_format(FORM_TABLE);
1014 current_format()->table_sep = atoi(optarg);
1017 current_format()->table_grid = 1;
1028 out_format = in_format;
1029 if (!in_format->read_line)
1030 bad_args("Write-only format selected for input.");
1031 if (!out_format->write_line)
1032 bad_args("Read-only format selected for output.");
1035 for (int i = optind; i < argc; i++) {
1036 err = parse_selector(argv[i]);
1040 finish_parse_selectors();
1042 want_stats = out_format->needs_stats | want_equalize;