2 * A Swiss-Army Knife for CSV-like Files
4 * (c) 2012 Martin Mares <mj@ucw.cz>
20 #define NONRET __attribute__((noreturn))
21 #define UNUSED __attribute__((unused))
27 /*** General functions ***/
/*
 * Report a fatal error: prints "xsv: " followed by the printf-style
 * message built from msg and the variadic arguments to stderr.
 * Declared NONRET (noreturn), so it never returns to the caller.
 * NOTE(review): the va_list setup and the terminating call are not
 * visible in this extract.
 */
29 static void NONRET die(char *msg, ...)
33 fprintf(stderr, "xsv: ");
34 vfprintf(stderr, msg, args);
40 /*** Memory allocation ***/
/*
 * malloc() wrapper that reports out-of-memory through die(), naming the
 * requested size in the message.
 * NOTE(review): the failure test guarding die() and the return of p are
 * not visible in this extract -- presumably `if (!p)` / `return p`; confirm.
 */
42 static void *xmalloc(size_t bytes)
44 void *p = malloc(bytes);
46 die("Out of memory (cannot allocate %zu bytes)", bytes);
/*
 * Allocate `bytes` bytes through xmalloc().
 * NOTE(review): judging by the name the block is presumably cleared before
 * being returned, but neither the clearing nor the return is visible here.
 */
50 static void *xmalloc_zero(size_t bytes)
52 void *p = xmalloc(bytes);
/*
 * realloc() wrapper that reports out-of-memory through die(), naming the
 * requested size in the message.
 * NOTE(review): the failure test guarding die() and the return of p are
 * not visible in this extract -- confirm against the full file.
 */
57 static void *xrealloc(void *old, size_t bytes)
59 void *p = realloc(old, bytes);
61 die("Out of memory (cannot allocate %zu bytes)", bytes);
/*
 * DECLARE_BUF(name, type) generates a growable array of `type`:
 *   name_t        -- { start, count, max }: storage, elements in use, capacity
 *   name_init()   -- empty buffer with no storage
 *   name_reset()  -- drop all elements but keep the storage
 *   name_count()  -- number of elements in use
 *   name_extend() -- set capacity to 16 if it was 0, otherwise double it,
 *                    and resize the storage with xrealloc()
 *   name_push()   -- append one slot (extending first when full) and
 *                    return a pointer to it
 *   name_first()  -- pointer to the start of the storage
 *   name_nth()    -- pointer to element n (no bounds check)
 * Because name_extend() reallocates, pointers obtained earlier from
 * name_push()/name_first()/name_nth() may be invalidated by a later push.
 * NOTE(review): the lines closing name_extend() and name_push() are not
 * visible in this extract.
 */
65 #define DECLARE_BUF(name, type) \
66 typedef struct { type *start; int count; int max; } name##_t; \
67 static inline void name##_init(name##_t *b) { b->start = NULL; b->count = b->max = 0; } \
68 static inline void name##_reset(name##_t *b) { b->count = 0; } \
69 static inline int name##_count(name##_t *b) { return b->count; } \
70 static void name##_extend(name##_t *b) { \
71 b->max = b->max ? 2*b->max : 16; \
72 b->start = xrealloc(b->start, b->max * sizeof(type)); \
74 static inline type *name##_push(name##_t *b) { \
75 if (b->count >= b->max) name##_extend(b); \
76 return &b->start[b->count++]; \
78 static inline type *name##_first(name##_t *b) { return b->start; } \
79 static inline type *name##_nth(name##_t *b, int n) { return &b->start[n]; } \
82 DECLARE_BUF(intarray, int);	// intarray_t: growable array of int
83 DECLARE_BUF(stringarray, char *);	// stringarray_t: growable array of char pointers
85 /*** Formats and their parameters ***/
102 int (*read_line)(struct format *fmt);
103 void (*write_line)(struct format *fmt);
104 void (*write_grid)(struct format *fmt, int pos); // -1=above, 1=below, 0=after header
109 char *set_field_names;
110 struct field_names *field_names;
120 pcre_extra *pcre_extra;
122 // Temporary file backend:
130 static struct format *in_format, *out_format;	// formats whose callbacks read_line()/write_line()/write_grid() dispatch to
131 static int want_trim;	// tested by one_pass() on odd-numbered passes
138 DECLARE_BUF(fields, struct field);	// fields_t: growable array of field descriptors
139 DECLARE_BUF(line, unsigned char);	// line_t: growable byte buffer
141 static fields_t in_fields, out_fields;	// fields of the record just read / fields selected for output
142 static struct field *in_field;	// field most recently created by new_field()
143 static line_t in_line;	// bytes of the current record; field start_pos values index into it
144 static int line_number;	// printed by warn() in its "Warning at line %d" prefix
/*
 * Read the next input record: clears the input field list and the line
 * buffer, then delegates to the input format's read_line callback.
 * An error indicator on stdin is fatal (die()).
 * NOTE(review): the statements between the visible lines -- what is
 * returned for "no record" versus "record read" -- are not visible in
 * this extract.
 */
146 static int read_line(void)
148 fields_reset(&in_fields);
149 line_reset(&in_line);
151 if (!in_format->read_line(in_format))
153 if (ferror_unlocked(stdin))
154 die("I/O error when reading standard input");
158 static void write_line(void)
160 out_format->write_line(out_format);
161 if (ferror_unlocked(stdout))
162 die("I/O error when writing standard input");
165 static void write_grid(int pos)
167 if (out_format->write_grid) {
168 out_format->write_grid(out_format, pos);
169 if (ferror_unlocked(stdout))
170 die("I/O error when writing standard input");
/*
 * Append a new field descriptor to in_fields, make it the current
 * in_field and record `pos` (an offset into in_line) as its start.
 * NOTE(review): initialisation of the remaining members is not visible
 * in this extract.
 */
174 static void new_field(int pos)
176 in_field = fields_push(&in_fields);
177 in_field->start_pos = pos;
181 static void ensure_field(int pos)
/*
 * Return a pointer to the first byte of field `i` of `fields`.
 * The bytes are always looked up in in_line, whichever field list is
 * passed, and the pointer refers to in_line's own storage.
 * NOTE(review): the store to *len is not visible in this extract --
 * presumably the field's len member; confirm.
 */
187 static unsigned char *get_field(fields_t *fields, int i, int *len)
189 struct field *f = fields_nth(fields, i);
191 return line_nth(&in_line, f->start_pos);
/*
 * Print a warning to stderr: "Warning at line <line_number>: " followed by
 * the printf-style message built from msg and the variadic arguments.
 * NOTE(review): how fmt is used (presumably to honour its quiet flag) and
 * the va_list handling are not visible in this extract.
 */
194 static void warn(struct format *fmt, char *msg, ...)
197 fprintf(stderr, "Warning at line %d: ", line_number);
200 vfprintf(stderr, msg, args);
/*
 * Read characters from stdin with getchar_unlocked() and append them to
 * in_line. One exit path returns 1 if in_line holds any bytes, else 0.
 * NOTE(review): the loop and the conditions that end a line are not
 * visible in this extract.
 */
206 static int next_line(void)
209 int c = getchar_unlocked();
213 return !!line_count(&in_line);
216 *line_push(&in_line) = c;
/*
 * Measure field f in characters rather than bytes: walks the field's bytes
 * in in_line with mbrlen(), starting from a zeroed conversion state, so the
 * result depends on the current locale's character encoding.
 * NOTE(review): the loop, the handling of mbrlen() error results and the
 * returned value are not visible in this extract.
 */
220 static int field_chars(struct field *f)
222 unsigned char *s = line_nth(&in_line, f->start_pos);
225 memset(&mbs, 0, sizeof(mbs));
229 size_t k = mbrlen((char *) s + i, f->len - i, &mbs);
239 /*** Field statistics ***/
// Widest field seen so far in each column, as measured by field_chars()
241 static intarray_t column_widths;
/*
 * Fold the current output record into column_widths: for every output
 * field, make sure the column has an entry (new entries start at 0) and
 * raise the entry to the field's width if the field is wider.
 */
243 static void update_stats(void)
245 for (int i = 0; i < fields_count(&out_fields); i++) {
246 struct field *f = fields_nth(&out_fields, i);
247 intarray_t *w = &column_widths;
// Grow the array until index i exists
249 while (i >= intarray_count(w))
250 *intarray_push(w) = 0;
251 int fw = field_chars(f);
252 if (*intarray_nth(w, i) < fw)
253 *intarray_nth(w, i) = fw;
257 /*** CSV/TSV back-end ***/
/*
 * Read one CSV/TSV record from stdin, character by character, using
 * fmt->fs as the field separator and fmt->quote as the quote character.
 * Bytes that belong to fields are appended to in_line.
 * On end of input or newline it returns 1 if any field was produced,
 * else 0; a "Missing closing quote." warning can be issued on that path.
 * NOTE(review): the loop, the `quoted` state handling and the places where
 * fields are opened and closed are not visible in this extract.
 */
259 static int csv_read(struct format *fmt)
263 int c = getchar_unlocked();
264 int i = line_count(&in_line);
268 if (c < 0 || c == '\n') {
270 warn(fmt, "Missing closing quote.");
272 return !!fields_count(&in_fields);
277 if (c == fmt->quote) {
// Look at the character after the quote to tell a doubled quote apart
278 c = getchar_unlocked();
279 if (c != fmt->quote) {
283 // Two quotes assimilate to one
285 // Fall through to pushing the character
286 } else if (c == fmt->quote) {
// A separator only ends the field outside quotes
289 } else if (c == fmt->fs && !quoted) {
295 *line_push(&in_line) = c;
/* Return 1 when c is a space, a TAB or a form feed, and 0 otherwise. */
static int is_ws(int c)
{
	switch (c) {
	case ' ':
	case '\t':
	case '\f':
		return 1;
	default:
		return 0;
	}
}
/*
 * Write the output fields as one CSV/TSV record on stdout, terminated by
 * a newline. fmt->fs is the separator written with putchar_unlocked().
 * When fmt->quote is non-negative, quoting starts from fmt->always_quote
 * and the field is scanned for separator or quote characters.
 * A separator inside an unquoted field triggers a warning.
 * NOTE(review): the conditions around the separator and quote output, and
 * the way embedded quotes are written, are not visible in this extract.
 */
305 static void csv_write(struct format *fmt)
307 for (int i=0; i < fields_count(&out_fields); i++) {
309 unsigned char *p = get_field(&out_fields, i, &len);
312 if (fmt->quote >= 0) {
313 need_quotes = fmt->always_quote;
// Stop scanning as soon as quoting is known to be needed
314 for (int j=0; !need_quotes && j < len; j++) {
315 if (p[j] == fmt->fs || p[j] == fmt->quote)
320 putchar_unlocked(fmt->fs);
322 putchar_unlocked(fmt->quote);
323 for (int j=0; j < len; j++) {
325 if (c == fmt->fs && !need_quotes)
326 warn(fmt, "Field separator found inside field and quoting is turned off.");
332 putchar_unlocked(fmt->quote);
334 putchar_unlocked('\n');
337 /*** White-space back-end ***/
/*
 * Reader for whitespace-separated input: scans the bytes held in in_line
 * and assigns start positions to fields; fmt->strict_ws is consulted
 * together with the local `ws` state after the scan.
 * NOTE(review): how in_line is filled, how whitespace is classified and
 * how fields are opened and closed is not visible in this extract.
 */
339 static int ws_read(struct format *fmt)
344 unsigned char *line = line_first(&in_line);
345 int n = line_count(&in_line);
351 for (int i=0; i<n; i++) {
357 if (!in_field->start_pos &&
360 in_field->start_pos = i;
369 if (ws && fmt->strict_ws)
374 /*** Regex back-end ***/
/*
 * Compile the separator regex rx with PCRE (PCRE_DOLLAR_ENDONLY) and study
 * it; the results are stored in f->pcre and f->pcre_extra.
 * The caller in main() treats the return value as an error message.
 * NOTE(review): the error checks and the value returned on success
 * (presumably NULL) are not visible in this extract.
 */
376 static const char *regex_set(struct format *f, char *rx)
380 f->pcre = pcre_compile(rx, PCRE_DOLLAR_ENDONLY, &err, &errpos, NULL);
384 f->pcre_extra = pcre_study(f->pcre, 0, &err);
/*
 * Reader for regex-separated input: repeatedly searches the bytes of
 * in_line for the separator pattern starting at offset i.
 * A match ends the current field at the match start (ovec[0]); when there
 * is no match the rest of the line becomes the last field. Any pcre_exec
 * failure other than PCRE_ERROR_NOMATCH is reported with a warning.
 * NOTE(review): how in_line is filled, the loop and the return value are
 * not visible in this extract.
 */
391 static int regex_read(struct format *fmt)
396 unsigned char *c = line_first(&in_line);
397 int n = line_count(&in_line);
404 int sep = pcre_exec(fmt->pcre, fmt->pcre_extra, (char *) c, n, i, 0, ovec, 3);
406 if (sep != PCRE_ERROR_NOMATCH)
407 warn(fmt, "PCRE matching error %d", sep);
408 // No further occurrence of the separator: the rest is a single field
410 in_field->len = n - i;
414 in_field->len = ovec[0] - i;
419 /*** Table back-end ***/
/*
 * Write one table row on stdout. Iterates over all known columns
 * (column_widths), not just the fields of this record.
 * With fmt->table_grid set, a '|' and half of table_sep spaces precede the
 * cell and the remaining spaces follow it; a separate printf pads with the
 * full table_sep. The row ends with a newline.
 * A field wider than its recorded column width is reported as an internal
 * error.
 * NOTE(review): the branch structure, the output of the field bytes and
 * the padding loop are not visible in this extract.
 */
421 static void table_write(struct format *fmt)
423 for (int i = 0; i < intarray_count(&column_widths); i++) {
424 if (fmt->table_grid) {
425 putchar_unlocked('|');
426 printf("%*s", fmt->table_sep / 2, "");
428 printf("%*s", fmt->table_sep, "");
430 int cw = *intarray_nth(&column_widths, i);
// Columns beyond the end of this record have no field to print
432 if (i < fields_count(&out_fields)) {
434 unsigned char *p = get_field(&out_fields, i, &len);
435 fw = field_chars(fields_nth(&out_fields, i));
437 warn(fmt, "Internal error: Wrongly calculated column width (%d > %d)", fw, cw);
444 putchar_unlocked(' ');
// The part of table_sep not already printed before the cell
449 printf("%*s", fmt->table_sep - fmt->table_sep / 2, "");
453 putchar_unlocked('|');
454 putchar_unlocked('\n');
/*
 * Write a horizontal grid line on stdout: a '+' per column, a run whose
 * length w is the column width plus table_sep, a final '+' and a newline.
 * The fmt->table_grid flag is tested first; pos is unused.
 * NOTE(review): the early exit taken when the grid is off and the
 * character used to fill the w positions are not visible in this extract.
 */
457 static void table_write_grid(struct format *fmt, int pos UNUSED)
459 if (!fmt->table_grid)
462 for (int i = 0; i < intarray_count(&column_widths); i++) {
463 putchar_unlocked('+');
464 int w = fmt->table_sep + *intarray_nth(&column_widths, i);
468 putchar_unlocked('+');
469 putchar_unlocked('\n');
472 /*** Temporary file back-end ***/
/*
 * Read one record back from the temporary file fmt->tmp_file.
 * A field length is either the first byte read or a 4-byte big-endian
 * value assembled from the following bytes; a new field then starts at the
 * current end of in_line and its bytes are appended to in_line.
 * A short read produces a "Truncated temporary file" warning and a stream
 * error is fatal.
 * NOTE(review): the tests on the first byte (which values mean "long
 * length" or "end of record"), the loops and the return value are not
 * visible in this extract.
 */
474 static int tmp_read(struct format *fmt)
476 FILE *tf = fmt->tmp_file;
479 int c = getc_unlocked(tf);
// Assemble a 32-bit length, most significant byte first
485 c = getc_unlocked(tf);
486 c = (c << 8) | getc_unlocked(tf);
487 c = (c << 8) | getc_unlocked(tf);
488 c = (c << 8) | getc_unlocked(tf);
490 new_field(line_count(&in_line));
493 int x = getc_unlocked(tf);
495 warn(fmt, "Truncated temporary file");
498 *line_push(&in_line) = x;
502 if (ferror_unlocked(tf))
503 die("I/O error when reading temporary file");
/*
 * Write the output fields as one record to the temporary file
 * fmt->tmp_file. Each field is written as a length followed by its bytes:
 * the length is either a single byte or the marker 0xfe followed by four
 * bytes, most significant first. A 0xff byte is written after the fields.
 * A stream error is fatal.
 * NOTE(review): the condition choosing between the short and the long
 * length form is not visible in this extract.
 */
506 static void tmp_write(struct format *fmt)
508 FILE *tf = fmt->tmp_file;
510 for (int i = 0; i < fields_count(&out_fields); i++) {
512 unsigned char *p = get_field(&out_fields, i, &len);
515 putc_unlocked(len, tf);
517 putc_unlocked(0xfe, tf);
518 putc_unlocked((len >> 24) & 0xff, tf);
519 putc_unlocked((len >> 16) & 0xff, tf);
520 putc_unlocked((len >> 8) & 0xff, tf);
521 putc_unlocked(len & 0xff, tf);
525 putc_unlocked(*p++, tf);
527 putc_unlocked(0xff, tf);
529 if (ferror_unlocked(tf))
530 die("I/O error when writing temporary file");
/*
 * Strip leading and trailing whitespace (as defined by is_ws()) from every
 * input field. Only the field descriptors change; the bytes in in_line are
 * left untouched.
 * NOTE(review): the body of the trailing-whitespace loop is not visible in
 * this extract -- presumably it decrements f->len.
 */
535 static void trim_fields(void)
537 unsigned char *line = line_first(&in_line);
538 for (int i = 0; i < fields_count(&in_fields); i++) {
539 struct field *f = fields_nth(&in_fields, i);
// Leading whitespace: advance the start and shrink the length together
540 while (f->len && is_ws(line[f->start_pos]))
541 f->start_pos++, f->len--;
542 while (f->len && is_ws(line[f->start_pos + f->len - 1]))
547 /*** Field names and headers ***/
/*
 * Append a field name to fn->names: copies namelen bytes of `name` into a
 * freshly allocated buffer of namelen + 1 bytes and stores that pointer.
 * NOTE(review): the write of the terminating NUL into the extra byte is
 * not visible in this extract.
 */
553 static void add_field(struct field_names *fn, char *name, int namelen)
555 char *n = xmalloc(namelen + 1);
556 memcpy(n, name, namelen);
558 *stringarray_push(&fn->names) = n;
/*
 * Split the comma-separated list `names` and add each item to fn with
 * add_field(). The last item extends to the end of the string.
 * NOTE(review): the loop and the advance past each comma are not visible
 * in this extract.
 */
561 static void add_field_names(struct field_names *fn, char *names)
565 char *q = strchr(p, ',');
566 int len = q ? q-p : (int) strlen(p);
567 add_field(fn, p, len);
/*
 * Set up the input format's field names. Nothing is done unless the input
 * has a header line or names were given explicitly.
 * A zeroed field_names structure is attached to in_format; names given
 * through set_field_names are added with add_field_names(), and the
 * fields of a record are added one by one with add_field().
 * A missing header line is fatal.
 * NOTE(review): the branch structure tying the loop over in_fields to the
 * has_header/set_field_names cases is not visible in this extract.
 */
572 static void read_header(void)
574 if (!(in_format->has_header || in_format->set_field_names))
577 struct field_names *fn = xmalloc_zero(sizeof(*fn));
578 in_format->field_names = fn;
580 if (in_format->has_header) {
582 die("Missing input header");
585 if (in_format->set_field_names) {
586 add_field_names(fn, in_format->set_field_names);
588 for (int i = 0; i < fields_count(&in_fields); i++) {
590 char *s = (char *) get_field(&in_fields, i, &len);
591 add_field(fn, s, len);
/*
 * Produce the output header. The names come from the output format's own
 * set_field_names if given, otherwise from the input format's field names;
 * having neither is fatal.
 * The names are then laid out as an ordinary record: in_line is reused as
 * the byte storage and out_fields receives one field per name.
 * NOTE(review): what happens when has_header is off, the setting of each
 * field's len and the final write are not visible in this extract.
 */
596 static void write_header(void)
598 if (!out_format->has_header) {
603 if (out_format->set_field_names) {
604 struct field_names *fn = xmalloc_zero(sizeof(*fn));
605 out_format->field_names = fn;
606 add_field_names(fn, out_format->set_field_names);
607 } else if (in_format->field_names)
608 out_format->field_names = in_format->field_names;
610 die("Output header requested, but no field names specified");
// Build a synthetic record holding the field names
612 line_reset(&in_line);
613 fields_reset(&out_fields);
614 struct field_names *fn = out_format->field_names;
615 for (int i = 0; i < stringarray_count(&fn->names); i++) {
616 struct field *f = fields_push(&out_fields);
617 f->start_pos = line_count(&in_line);
619 char *s = *stringarray_nth(&fn->names, i);
621 *line_push(&in_line) = *s++;
626 // This is tricky: when we are formatting a table, field names are normally
627 // calculated in pass 1, but the header is written in pass 2, so we have to
628 // update column statistics, because field name can be too wide to fit.
635 static void write_footer(void)
/*
 * Search fn->names for an exact (strcmp) match of `name`.
 * NOTE(review): the return statements are not visible in this extract;
 * parse_field() treats a result greater than 0 as "found", so the result
 * is presumably a 1-based index -- confirm.
 */
640 static int find_field_by_name(struct field_names *fn, char *name)
642 for (int i = 0; i < stringarray_count(&fn->names); i++)
643 if (!strcmp(*stringarray_nth(&fn->names, i), name))
648 /*** Field selection ***/
651 int first_field, last_field; // 0 means "boundary"
654 DECLARE_BUF(selectors, struct selector);	// selectors_t: growable array of field selectors
655 static selectors_t selectors;	// filled by parse_selector(), walked by select_fields()
/*
 * Parse str as a decimal number, accumulating one digit at a time; a
 * character outside '0'..'9' is detected explicitly.
 * NOTE(review): the loop, the value returned for non-numeric input and any
 * overflow handling are not visible in this extract.
 */
657 static int parse_field_num(char *str)
662 if (*str < '0' || *str > '9')
666 f = 10*f + *str - '0';
/*
 * Resolve a field reference: first tries str as a number, then, if the
 * input format has field names, looks it up by name (a result > 0 counts
 * as found). An unresolvable reference is fatal.
 * NOTE(review): the handling of an empty string and the tests on the
 * numeric result are not visible in this extract.
 */
672 static int parse_field(char *str)
677 int f = parse_field_num(str);
681 if (in_format->field_names && (f = find_field_by_name(in_format->field_names, str)) > 0)
684 die("Unknown field %s", str);
/*
 * Parse one selector argument and append it to `selectors`.
 * Works on a stack copy (buf) sized to the argument. With a '-' present
 * the parts before and after it become first_field and last_field;
 * otherwise both are set to the single field named.
 * main() stores the return value in its `err` variable.
 * NOTE(review): the copy into buf, the splitting at '-' and the returned
 * value are not visible in this extract.
 */
687 static char *parse_selector(char *str)
689 char buf[strlen(str) + 1];
692 struct selector *s = selectors_push(&selectors);
693 char *sep = strchr(buf, '-');
696 s->first_field = parse_field(buf);
697 s->last_field = parse_field(sep);
699 s->first_field = s->last_field = parse_field(buf);
/*
 * Post-process the selector list once all arguments are parsed; the
 * visible code only tests whether the list is empty.
 * NOTE(review): the action taken for an empty list is not visible in this
 * extract.
 */
704 static void finish_parse_selectors(void)
706 if (!selectors_count(&selectors))
/*
 * Build out_fields from in_fields according to the selector list.
 * For each selector, fields first..last (1-based) are appended in order;
 * `last` can be replaced by the number of input fields. A position outside
 * the record yields an empty field (start_pos = len = 0).
 * NOTE(review): the conditions that adjust `first` and `last` for the
 * "boundary" value 0 are not visible in this extract.
 */
710 static void select_fields(void)
712 for (int i = 0; i < selectors_count(&selectors); i++) {
713 struct selector *s = selectors_nth(&selectors, i);
714 int first = s->first_field;
717 int last = s->last_field;
719 last = fields_count(&in_fields);
720 for (int j = first; j <= last; j++) {
721 struct field *f = fields_push(&out_fields);
// Field numbers are 1-based, array indices 0-based
722 if (j >= 1 && j <= fields_count(&in_fields))
723 *f = *fields_nth(&in_fields, j-1);
725 f->start_pos = f->len = 0;
730 static void select_all_fields(void)
732 for (int i = 0; i < fields_count(&in_fields); i++)
733 *fields_push(&out_fields) = *fields_nth(&in_fields, i);
736 /*** Processing of files ***/
/*
 * Run one pass over the input. The visible steps are: a test of want_trim
 * that only applies on odd-numbered passes, a reset of out_fields for each
 * record, and a test of out_format->needs_stats.
 * NOTE(review): the record loop, header/footer handling, field selection
 * and the actions guarded by the visible tests are not visible in this
 * extract.
 */
738 static void one_pass(int pass)
748 if (want_trim && (pass & 1))
751 fields_reset(&out_fields);
757 if (out_format->needs_stats)
/*
 * Two-pass processing through a temporary file. The real output format is
 * set aside, a temporary FORM_TMP format backed by tmpfile() becomes the
 * output, and afterwards that same format becomes the input while the real
 * output format is restored with needs_stats cleared.
 * NOTE(review): the calls that actually run each pass are not visible in
 * this extract.
 * NOTE(review): no check of the tmpfile() result is visible; it can return
 * NULL, which rewind()/fclose() and the tmp back-end would then
 * dereference -- confirm against the full file.
 */
767 static void two_pass(void)
769 struct format *final_format = out_format;
771 // We need to use character set info from the current locale
772 setlocale(LC_CTYPE, "");
774 // Pass 1: Set up writer of intermediate format
775 out_format = xmalloc_zero(sizeof(*out_format));
776 out_format->id = FORM_TMP;
777 out_format->read_line = tmp_read;
778 out_format->write_line = tmp_write;
779 out_format->tmp_file = tmpfile();
780 out_format->needs_stats = final_format->needs_stats;
781 out_format->field_names = in_format->field_names;
784 // Pass 2: Set up reader of intermediate format
785 in_format = out_format;
786 rewind(in_format->tmp_file);
788 out_format = final_format;
789 out_format->needs_stats = 0;
791 fclose(in_format->tmp_file);
794 /*** Parsing of arguments ***/
/*
 * Print the command-line help text. Declared NONRET, so it does not
 * return to the caller.
 * NOTE(review): the output call, the remaining help lines and the
 * terminating call are not visible in this extract.
 */
796 static void NONRET usage(void)
799 Usage: xsv <in-format> [<out-format>] <options> [<fields>]\n\
802 -t, --tsv TAB-separated values (default)\n\
803 -c, --csv Comma-separated values\n\
804 -w, --ws Values separated by arbitrary whitespace\n\
805 -W, --strict-ws Like --ws, but recognize empty columns at start/end\n\
806 -r, --regex=<rx> Separator given by Perl regular expression (input only)\n\
807 --table Format a table (output only)\n\
809 Format parameters:\n\
810 -d, --fs=<char> Delimiter of fields\n\
811 -f, --fields=<f>,... Set field names\n\
812 -h, --header The first line contains field names\n\
813 -q, --quiet Do not show warnings\n\
814 --always-quote Put quotes around all fields (CSV output only)\n\
815 --table-sep=<n> Separate table columns by <n> spaces (default: 2)\n\
816 --grid Separate table columns by grid lines\n\
819 --trim Trim leading and trailing whitespaces in fields\n\
/*
 * Report a command-line error: prints "xsv: " and the printf-style message
 * to stderr, followed by a hint to run `xsv --help'. Declared NONRET, so
 * it does not return to the caller.
 * NOTE(review): the va_list handling, any condition around the message and
 * the terminating call are not visible in this extract.
 */
824 static void NONRET bad_args(const char *msg, ...)
829 fprintf(stderr, "xsv: ");
830 vfprintf(stderr, msg, args);
834 fprintf(stderr, "Try `xsv --help' for more information.\n");
// getopt short-option string passed to getopt_long() in main(); -d, -f and -r take an argument
838 static const char short_options[] = "cd:f:hqr:twW";
/*
 * Long options for getopt_long(). The second member is 1 for options that
 * take an argument. Options with a short form return its letter; the
 * others return an OPT_* code. The all-zero entry terminates the table.
 */
849 static const struct option long_options[] = {
850 { "always-quote", 0, NULL, OPT_ALWAYS_QUOTE },
851 { "csv", 0, NULL, 'c' },
852 { "fields", 1, NULL, 'f' },
853 { "fs", 1, NULL, 'd' },
854 { "grid", 0, NULL, OPT_GRID },
855 { "header", 0, NULL, 'h' },
856 { "quiet", 0, NULL, 'q' },
857 { "regex", 1, NULL, 'r' },
858 { "strict-ws", 0, NULL, 'W' },
859 { "table", 0, NULL, OPT_TABLE },
860 { "table-sep", 1, NULL, OPT_TABLE_SEP },
861 { "trim", 0, NULL, OPT_TRIM },
862 { "tsv", 0, NULL, 't' },
863 { "ws", 0, NULL, 'w' },
864 { "help", 0, NULL, OPT_HELP },
865 { NULL, 0, NULL, 0 },
/*
 * Create a zero-initialised format for format_id and install its
 * callbacks. The visible assignments pair csv_read with csv_write (twice),
 * ws_read with csv_write, regex_read alone, and table_write with
 * table_write_grid.
 * Requesting more than two formats is a command-line error.
 * NOTE(review): the switch labels, the per-format parameters and the
 * assignment to in_format/out_format are not visible in this extract.
 */
868 static void set_format(int format_id)
870 struct format *f = xmalloc_zero(sizeof(*f));
877 f->read_line = csv_read;
878 f->write_line = csv_write;
883 f->read_line = csv_read;
884 f->write_line = csv_write;
889 f->read_line = ws_read;
890 f->write_line = csv_write;
893 f->read_line = regex_read;
896 f->write_line = table_write;
897 f->write_grid = table_write_grid;
905 else if (!out_format)
908 bad_args("At most two formats may be given.");
/*
 * Return the format that option parsing currently applies to; main() uses
 * the result to set per-format parameters. TSV can be installed as a
 * default through set_format(FORM_TSV).
 * NOTE(review): the conditions selecting between the formats and the
 * returned expressions are not visible in this extract.
 */
911 static struct format *current_format(void)
917 set_format(FORM_TSV);
921 int main(int argc, char **argv)
926 while ((opt = getopt_long(argc, argv, short_options, long_options, NULL)) >= 0)
929 set_format(FORM_CSV);
933 current_format()->fs = optarg[0];
935 bad_args("No field delimiter given.");
938 current_format()->set_field_names = optarg;
941 current_format()->has_header = 1;
944 current_format()->quiet = 1;
947 set_format(FORM_REGEX);
948 err = regex_set(current_format(), optarg);
950 bad_args("Error compiling regex: %s", err);
953 set_format(FORM_TSV);
960 current_format()->strict_ws = 1;
962 case OPT_ALWAYS_QUOTE:
963 if (current_format()->id != FORM_CSV)
964 bad_args("--always-quote makes sense only for CSV.");
965 current_format()->always_quote = 1;
973 set_format(FORM_TABLE);
976 current_format()->table_sep = atoi(optarg);
979 current_format()->table_grid = 1;
987 out_format = in_format;
988 if (!in_format->read_line)
989 bad_args("Write-only format selected for input.");
990 if (!out_format->write_line)
991 bad_args("Read-only format selected for output.");
994 for (int i = optind; i < argc; i++) {
995 err = parse_selector(argv[i]);
999 finish_parse_selectors();
1001 if (out_format->needs_stats)