2 * A Swiss-Army Knife for CSV-like Files
4 * (c) 2012 Martin Mares <mj@ucw.cz>
/* Shorthand for the compiler attribute that marks a function as never returning to its caller. */
20 #define NONRET __attribute__((noreturn))
25 /*** General functions ***/
/*
 * Report a fatal error on stderr: the fixed prefix "xsv: " followed by the
 * printf-style message `msg` formatted with the variadic arguments.
 * Declared NONRET, i.e. control does not come back to the caller.
 */
27 static void NONRET die(char *msg, ...)
31 fprintf(stderr, "xsv: ");
32 vfprintf(stderr, msg, args);
38 /*** Memory allocation ***/
/*
 * malloc() wrapper for `bytes` bytes whose failure path ends in die() with
 * an "Out of memory" message that includes the requested size.
 * NOTE(review): presumably this means callers never receive NULL -- confirm.
 */
40 static void *xmalloc(size_t bytes)
42 void *p = malloc(bytes);
44 die("Out of memory (cannot allocate %zu bytes)", bytes);
/*
 * Allocate `bytes` bytes through xmalloc().
 * NOTE(review): going by the name, the block is presumably zero-filled
 * before it is returned -- confirm against the full body.
 */
48 static void *xmalloc_zero(size_t bytes)
50 void *p = xmalloc(bytes);
/*
 * realloc() wrapper: resizes `old` to `bytes` bytes; its failure path ends
 * in die() with the same "Out of memory" message as xmalloc().
 */
55 static void *xrealloc(void *old, size_t bytes)
57 void *p = realloc(old, bytes);
59 die("Out of memory (cannot allocate %zu bytes)", bytes);
/*
 * DECLARE_BUF(name, type) generates a growable array of `type`:
 *
 *   name_t          { start, count, max } -- storage, used elements, capacity
 *   name_init(b)    empty buffer without storage
 *   name_reset(b)   forget the elements but keep the allocated storage
 *   name_count(b)   number of elements in use
 *   name_extend(b)  grow capacity (16 initially, doubled afterwards) via xrealloc()
 *   name_push(b)    append one slot, growing if full; returns a pointer to it
 *   name_first(b)   pointer to element 0 (NULL while nothing was allocated)
 *   name_nth(b, n)  pointer to element n; no bounds check is performed
 *
 * Because growth goes through realloc, pointers obtained from push/first/nth
 * may become invalid after a later push on the same buffer.
 */
63 #define DECLARE_BUF(name, type) \
64 typedef struct { type *start; int count; int max; } name##_t; \
65 static inline void name##_init(name##_t *b) { b->start = NULL; b->count = b->max = 0; } \
66 static inline void name##_reset(name##_t *b) { b->count = 0; } \
67 static inline int name##_count(name##_t *b) { return b->count; } \
68 static void name##_extend(name##_t *b) { \
69 b->max = b->max ? 2*b->max : 16; \
70 b->start = xrealloc(b->start, b->max * sizeof(type)); \
72 static inline type *name##_push(name##_t *b) { \
73 if (b->count >= b->max) name##_extend(b); \
74 return &b->start[b->count++]; \
76 static inline type *name##_first(name##_t *b) { return b->start; } \
77 static inline type *name##_nth(name##_t *b, int n) { return &b->start[n]; } \
/* Growable arrays of int (intarray_t) and of char * (stringarray_t). */
80 DECLARE_BUF(intarray, int);
81 DECLARE_BUF(stringarray, char *);
83 /*** Formats and their parameters ***/
/* Per-format callbacks; read_line()'s result is tested for zero by the global read_line(). */
100 int (*read_line)(struct format *fmt);
101 void (*write_line)(struct format *fmt);
/* Comma-separated field names given on the command line (passed to add_field_names()). */
106 char *set_field_names;
/* Field name list attached by read_header() / write_header(). */
107 struct field_names *field_names;
/* Result of pcre_study() for the regex back-end, handed to pcre_exec(). */
117 pcre_extra *pcre_extra;
119 // Temporary file backend:
/* Formats currently used for reading input and writing output. */
126 static struct format *in_format, *out_format;
/* When set, one_pass() trims fields on odd-numbered passes. */
127 static int want_trim;
/* Buffer types: an array of field descriptors and a byte buffer for line data. */
134 DECLARE_BUF(fields, struct field);
135 DECLARE_BUF(line, unsigned char);
/* Fields parsed from the current input line and fields selected for output. */
137 static fields_t in_fields, out_fields;
/* Field most recently created by new_field(). */
138 static struct field *in_field;
/* Bytes of the current line; a field's start_pos is an index into this buffer. */
139 static line_t in_line;
/* Line number reported by warn(). */
140 static int line_number;
/*
 * Read the next input line through the input format's read_line callback.
 * The field list and the line buffer are cleared first; an error flagged
 * on stdin is fatal.
 * NOTE(review): presumably returns zero once no further line is available
 * -- confirm against the full body.
 */
142 static int read_line(void)
144 fields_reset(&in_fields);
145 line_reset(&in_line);
147 if (!in_format->read_line(in_format))
149 if (ferror_unlocked(stdin))
150 die("I/O error when reading standard input");
154 static void write_line(void)
156 out_format->write_line(out_format);
157 if (ferror_unlocked(stdout))
158 die("I/O error when writing standard input");
/*
 * Append a fresh field descriptor to in_fields, make it the current
 * in_field and record `pos` (an offset into in_line) as its start.
 */
161 static void new_field(int pos)
163 in_field = fields_push(&in_fields);
164 in_field->start_pos = pos;
/*
 * NOTE(review): presumably makes sure a current input field exists,
 * starting one at line offset `pos` when needed -- confirm against the body.
 */
168 static void ensure_field(int pos)
174 // FIXME: Use elsewhere
/*
 * Return a pointer to the first byte of field `i` of `fields`. The pointer
 * always refers to the global in_line buffer, whichever field list is passed.
 * NOTE(review): `len` is presumably an out-parameter receiving the field
 * length -- confirm.
 */
175 static unsigned char *get_field(fields_t *fields, int i, int *len)
177 struct field *f = fields_nth(fields, i);
179 return line_nth(&in_line, f->start_pos);
/*
 * Print a warning to stderr: "Warning at line N: " (N = line_number)
 * followed by the printf-style message `msg`.
 * NOTE(review): presumably suppressed when `fmt` has its quiet flag set
 * (the -q option) -- confirm.
 */
182 static void warn(struct format *fmt, char *msg, ...)
185 fprintf(stderr, "Warning at line %d: ", line_number);
188 vfprintf(stderr, msg, args);
/*
 * Read characters from stdin with getchar_unlocked() and append them to
 * in_line. One return path yields 1 if in_line holds any bytes and 0 otherwise.
 * NOTE(review): presumably that path is taken at end of input, so a final
 * line without a newline is still delivered -- confirm.
 */
194 static int next_line(void)
197 int c = getchar_unlocked();
201 return !!line_count(&in_line);
204 *line_push(&in_line) = c;
/*
 * Walk the bytes of field `f` inside in_line using mbrlen() with a
 * zero-initialised conversion state. The result is used by update_stats()
 * and table_write() as the field's width.
 * NOTE(review): presumably returns the number of multibyte characters in
 * the current locale -- confirm, including how invalid sequences are counted.
 */
208 static int field_chars(struct field *f)
210 unsigned char *s = line_nth(&in_line, f->start_pos);
213 memset(&mbs, 0, sizeof(mbs));
217 size_t k = mbrlen((char *) s + i, f->len - i, &mbs);
227 /*** Field statistics ***/
/* Widest field_chars() value seen so far for each output column (see update_stats()). */
229 static intarray_t column_widths;
/*
 * Fold the current output line into column_widths: for every output field,
 * raise the recorded width of its column to the field's character count if
 * that is larger.
 */
231 static void update_stats(void)
233 for (int i = 0; i < fields_count(&out_fields); i++) {
234 struct field *f = fields_nth(&out_fields, i);
235 intarray_t *w = &column_widths;
// Columns not seen before start with width 0
237 while (i >= intarray_count(w))
238 *intarray_push(w) = 0;
239 int fw = field_chars(f);
240 if (*intarray_nth(w, i) < fw)
241 *intarray_nth(w, i) = fw;
245 /*** CSV/TSV back-end ***/
/*
 * Reader for character-separated input (CSV/TSV): consumes characters from
 * stdin, storing field data in in_line.
 *
 * Visible behaviour: end of input or a newline finishes the line, and the
 * function then returns non-zero iff at least one field was recorded.
 * A doubled quote character stands for one literal quote. The separator
 * fmt->fs is only treated specially outside quotes.
 * NOTE(review): "Missing closing quote." is presumably issued only when the
 * line ends inside a quoted section -- confirm.
 */
247 static int csv_read(struct format *fmt)
251 int c = getchar_unlocked();
252 int i = line_count(&in_line);
256 if (c < 0 || c == '\n') {
258 warn(fmt, "Missing closing quote.");
260 return !!fields_count(&in_fields);
265 if (c == fmt->quote) {
266 c = getchar_unlocked();
267 if (c != fmt->quote) {
271 // Two quotes assimilate to one
273 // Fall through to pushing the character
274 } else if (c == fmt->quote) {
277 } else if (c == fmt->fs && !quoted) {
283 *line_push(&in_line) = c;
/* Return non-zero iff `c` is a space, a tab or a form feed. Newline is not included. */
288 static int is_ws(int c)
290 return (c == ' ' || c == '\t' || c == '\f');
/*
 * Writer for character-separated output: prints the fields of out_fields
 * to stdout with fmt->fs between them and a newline at the end.
 *
 * With quoting available (fmt->quote >= 0) a field is quoted when
 * fmt->always_quote is set or when it contains the separator or the quote
 * character. A separator inside a field that is not being quoted triggers
 * a warning.
 * NOTE(review): how quote characters inside a quoted field are escaped is
 * not established here -- confirm.
 */
293 static void csv_write(struct format *fmt)
295 unsigned char *line = line_first(&in_line);
296 int n = fields_count(&out_fields);
297 for (int i=0; i<n; i++) {
298 struct field *f = fields_nth(&out_fields, i);
300 if (fmt->quote >= 0) {
301 need_quotes = fmt->always_quote;
302 for (int j=0; !need_quotes && j < f->len; j++) {
303 int c = line[f->start_pos + j];
304 if (c == fmt->fs || c == fmt->quote)
309 putchar_unlocked(fmt->fs);
311 putchar_unlocked(fmt->quote);
312 for (int j=0; j < f->len; j++) {
313 int c = line[f->start_pos + j];
314 if (c == fmt->fs && !need_quotes)
315 warn(fmt, "Field separator found inside field and quoting is turned off.");
321 putchar_unlocked(fmt->quote);
323 putchar_unlocked('\n');
326 /*** White-space back-end ***/
/*
 * Reader for whitespace-separated input: scans the bytes already held in
 * in_line and records field positions as indices into that buffer.
 * fmt->strict_ws selects the --strict-ws variant, which the usage text
 * describes as recognizing empty columns at the start/end of a line.
 * NOTE(review): the return convention is presumably that of the other
 * readers (non-zero when a line was read) -- confirm.
 */
328 static int ws_read(struct format *fmt)
333 unsigned char *line = line_first(&in_line);
334 int n = line_count(&in_line);
340 for (int i=0; i<n; i++) {
346 if (!in_field->start_pos &&
349 in_field->start_pos = i;
358 if (ws && fmt->strict_ws)
363 /*** Regex back-end ***/
/*
 * Compile the separator regex `rx` for format `f` (PCRE, with
 * PCRE_DOLLAR_ENDONLY) and study it; the results are kept in f->pcre and
 * f->pcre_extra. The caller in main() treats the returned string as an
 * error message to report.
 * NOTE(review): presumably NULL is returned on success -- confirm.
 */
365 static const char *regex_set(struct format *f, char *rx)
369 f->pcre = pcre_compile(rx, PCRE_DOLLAR_ENDONLY, &err, &errpos, NULL);
373 f->pcre_extra = pcre_study(f->pcre, 0, &err);
/*
 * Reader for regex-separated input: repeatedly searches the bytes of
 * in_line for the separator pattern, starting at offset i.
 *
 * On a match the current field ends where the match begins (ovec[0]).
 * When no further match exists the remainder of the line forms the last
 * field. A PCRE result other than "no match" on that path is reported as
 * a warning.
 */
380 static int regex_read(struct format *fmt)
385 unsigned char *c = line_first(&in_line);
386 int n = line_count(&in_line);
393 int sep = pcre_exec(fmt->pcre, fmt->pcre_extra, (char *) c, n, i, 0, ovec, 3);
395 if (sep != PCRE_ERROR_NOMATCH)
396 warn(fmt, "PCRE matching error %d", sep);
397 // No further occurrence of the separator: the rest is a single field
399 in_field->len = n - i;
403 in_field->len = ovec[0] - i;
408 /*** Table back-end ***/
/*
 * Writer for the table format: prints each output field's bytes to stdout,
 * using the width recorded in column_widths for its column, and ends the
 * line with a newline. fmt->table_sep spaces are printed as a column
 * separator and single spaces are used for padding.
 * NOTE(review): presumably the separator is skipped before the first
 * column and fields are padded on the right up to the column width;
 * a field wider than its recorded column is reported as an internal
 * error -- confirm the exact conditions.
 */
410 static void table_write(struct format *fmt)
412 for (int i = 0; i < fields_count(&out_fields); i++) {
414 printf("%*s", fmt->table_sep, "");
415 struct field *f = fields_nth(&out_fields, i);
416 int fw = field_chars(f);
417 int cw = *intarray_nth(&column_widths, i);
419 warn(fmt, "Internal error: Wrongly calculated column width (%d > %d)", fw, cw);
422 unsigned char *p = line_nth(&in_line, f->start_pos);
423 for (int j = 0; j < f->len; j++)
424 putchar_unlocked(p[j]);
426 putchar_unlocked(' ');
430 putchar_unlocked('\n');
433 /*** Temporary file back-end ***/
/*
 * Reader for the intermediate temporary file produced by tmp_write().
 * Each field is appended to in_line as a new field. A length can be stored
 * as four bytes, most significant first, matching the long form emitted by
 * the writer. An error flagged on the file is fatal.
 * NOTE(review): "Truncated temporary file" is presumably reported when the
 * file ends inside a field; the results of the four length-byte reads do
 * not appear to be checked for EOF -- confirm.
 */
435 static int tmp_read(struct format *fmt)
437 FILE *tf = fmt->tmp_file;
440 int c = getc_unlocked(tf);
446 c = getc_unlocked(tf);
447 c = (c << 8) | getc_unlocked(tf);
448 c = (c << 8) | getc_unlocked(tf);
449 c = (c << 8) | getc_unlocked(tf);
451 new_field(line_count(&in_line));
454 int x = getc_unlocked(tf);
456 warn(fmt, "Truncated temporary file");
459 *line_push(&in_line) = x;
463 if (ferror_unlocked(tf))
464 die("I/O error when reading temporary file");
/*
 * Writer for the intermediate temporary file. For every output field a
 * length prefix is written, followed by the field's bytes. Two prefix
 * forms exist: the length as a single byte, or the marker 0xfe followed
 * by the length as four bytes, most significant first. A 0xff byte is
 * also written. An error flagged on the file is fatal.
 * NOTE(review): presumably the one-byte form is used for lengths below
 * 0xfe and 0xff terminates the line -- confirm.
 */
467 static void tmp_write(struct format *fmt)
469 FILE *tf = fmt->tmp_file;
471 for (int i = 0; i < fields_count(&out_fields); i++) {
472 struct field *f = fields_nth(&out_fields, i);
474 putc_unlocked(f->len, tf);
476 putc_unlocked(0xfe, tf);
477 putc_unlocked((f->len >> 24) & 0xff, tf);
478 putc_unlocked((f->len >> 16) & 0xff, tf);
479 putc_unlocked((f->len >> 8) & 0xff, tf);
480 putc_unlocked(f->len & 0xff, tf);
483 unsigned char *p = line_nth(&in_line, f->start_pos);
484 for (int j = 0; j < f->len; j++)
485 putc_unlocked(*p++, tf);
487 putc_unlocked(0xff, tf);
489 if (ferror_unlocked(tf))
490 die("I/O error when writing temporary file");
/*
 * Strip whitespace (as defined by is_ws()) around every input field by
 * adjusting the field descriptors; the bytes in in_line are left untouched.
 */
495 static void trim_fields(void)
497 unsigned char *line = line_first(&in_line);
498 for (int i = 0; i < fields_count(&in_fields); i++) {
499 struct field *f = fields_nth(&in_fields, i);
// Leading whitespace: move the start forward and shorten the field
500 while (f->len && is_ws(line[f->start_pos]))
501 f->start_pos++, f->len--;
// Trailing whitespace: examine the last byte of the field
502 while (f->len && is_ws(line[f->start_pos + f->len - 1]))
507 /*** Field names and headers ***/
/*
 * Append a field name to `fn`: the `namelen` bytes at `name` are copied
 * into a freshly allocated buffer of namelen + 1 bytes whose pointer is
 * pushed onto fn->names.
 * NOTE(review): the extra byte is presumably set to the NUL terminator --
 * confirm.
 */
513 static void add_field(struct field_names *fn, char *name, int namelen)
515 char *n = xmalloc(namelen + 1);
516 memcpy(n, name, namelen);
518 *stringarray_push(&fn->names) = n;
/*
 * Split the comma-separated list `names` and add each piece to `fn` via
 * add_field(). A piece ends at the next ',' or, for the last one, at the
 * end of the string.
 */
521 static void add_field_names(struct field_names *fn, char *names)
525 char *q = strchr(p, ',');
526 int len = q ? q-p : (int) strlen(p);
527 add_field(fn, p, len);
/*
 * Establish the field names of the input format. Applies only when the
 * input has a header line or names were given explicitly; a zero-filled
 * field_names list is then allocated and attached to in_format.
 * Names given explicitly are added with add_field_names(); names can also
 * be taken from the fields of a line that was read.
 * NOTE(review): presumably the header line is always consumed when
 * has_header is set (a missing one being fatal), and its fields serve as
 * names only when no explicit names exist -- confirm.
 */
532 static void read_header(void)
534 if (!(in_format->has_header || in_format->set_field_names))
537 struct field_names *fn = xmalloc_zero(sizeof(*fn));
538 in_format->field_names = fn;
540 if (in_format->has_header) {
542 die("Missing input header");
545 if (in_format->set_field_names) {
546 add_field_names(fn, in_format->set_field_names);
548 for (int i = 0; i < fields_count(&in_fields); i++) {
550 char *s = (char *) get_field(&in_fields, i, &len);
551 add_field(fn, s, len);
/*
 * Produce the output header when the output format asks for one.
 *
 * The names come from the output format's explicit list if present,
 * otherwise from the input format's field names; having neither is fatal.
 * The names are then laid out as a synthetic line: in_line and out_fields
 * are reset and each name becomes one output field whose bytes are
 * appended to in_line.
 * NOTE(review): presumably the synthetic line is then emitted through
 * write_line() -- confirm.
 */
556 static void write_header(void)
558 if (!out_format->has_header)
561 if (out_format->set_field_names) {
562 struct field_names *fn = xmalloc_zero(sizeof(*fn));
563 out_format->field_names = fn;
564 add_field_names(fn, out_format->set_field_names);
565 } else if (in_format->field_names)
566 out_format->field_names = in_format->field_names;
568 die("Output header requested, but no field names specified");
570 line_reset(&in_line);
571 fields_reset(&out_fields);
572 struct field_names *fn = out_format->field_names;
573 for (int i = 0; i < stringarray_count(&fn->names); i++) {
574 struct field *f = fields_push(&out_fields);
575 f->start_pos = line_count(&in_line);
577 char *s = *stringarray_nth(&fn->names, i);
579 *line_push(&in_line) = *s++;
/*
 * Linear search of fn->names for an exact (strcmp) match of `name`.
 * NOTE(review): parse_field() accepts only a result > 0, so a match is
 * presumably reported as a 1-based field number and a miss as a
 * non-positive value -- confirm.
 */
586 static int find_field_by_name(struct field_names *fn, char *name)
588 for (int i = 0; i < stringarray_count(&fn->names); i++)
589 if (!strcmp(*stringarray_nth(&fn->names, i), name))
594 /*** Field selection ***/
/* Inclusive range of field numbers covered by one selector. */
597 int first_field, last_field; // 0 means "boundary"
/* All selectors, in the order parse_selector() appended them. */
600 DECLARE_BUF(selectors, struct selector);
601 static selectors_t selectors;
/*
 * Parse `str` as an unsigned decimal field number, accumulating one digit
 * at a time. Characters outside '0'..'9' are detected.
 * NOTE(review): presumably a non-digit makes the function report failure
 * instead of a number; no overflow check appears here -- confirm.
 */
603 static int parse_field_num(char *str)
608 if (*str < '0' || *str > '9')
612 f = 10*f + *str - '0';
/*
 * Translate one field reference into a field number. The reference is
 * tried as a number via parse_field_num() and, if the input format has
 * field names, as a name via find_field_by_name(). A reference that
 * cannot be resolved is fatal ("Unknown field").
 */
618 static int parse_field(char *str)
623 int f = parse_field_num(str);
627 if (in_format->field_names && (f = find_field_by_name(in_format->field_names, str)) > 0)
630 die("Unknown field %s", str);
/*
 * Parse one selector argument and append it to `selectors`. The text is
 * handled in a local copy (`buf`). A '-' splits it into the first and last
 * field of a range; without '-' both ends are the same field.
 * The caller in main() stores the return value in its `err` variable.
 * NOTE(review): presumably NULL means success and anything else is an
 * error message -- confirm.
 */
633 static char *parse_selector(char *str)
635 char buf[strlen(str) + 1];
638 struct selector *s = selectors_push(&selectors);
639 char *sep = strchr(buf, '-');
642 s->first_field = parse_field(buf);
643 s->last_field = parse_field(sep);
645 s->first_field = s->last_field = parse_field(buf);
/*
 * Called by main() after all selector arguments were parsed; takes action
 * when no selector was given at all.
 * NOTE(review): presumably a default "all fields" selection is installed
 * in that case -- confirm against the body.
 */
650 static void finish_parse_selectors(void)
652 if (!selectors_count(&selectors))
/*
 * Build out_fields from in_fields according to `selectors`. For each
 * selector the field numbers first..last (1-based, inclusive) are
 * appended in order; a number outside the current line yields an empty
 * field (start 0, length 0).
 * NOTE(review): a boundary value of 0 is presumably replaced by the first
 * or last field of the line respectively -- confirm.
 */
656 static void select_fields(void)
658 for (int i = 0; i < selectors_count(&selectors); i++) {
659 struct selector *s = selectors_nth(&selectors, i);
660 int first = s->first_field;
663 int last = s->last_field;
665 last = fields_count(&in_fields);
666 for (int j = first; j <= last; j++) {
667 struct field *f = fields_push(&out_fields);
668 if (j >= 1 && j <= fields_count(&in_fields))
669 *f = *fields_nth(&in_fields, j-1);
671 f->start_pos = f->len = 0;
/* Append a copy of every input field descriptor, in order, to out_fields. */
676 static void select_all_fields(void)
678 for (int i = 0; i < fields_count(&in_fields); i++)
679 *fields_push(&out_fields) = *fields_nth(&in_fields, i);
682 /*** Processing of files ***/
/*
 * Run one processing pass, numbered `pass`, over the input. Trimming
 * (want_trim) is applied only when `pass` is odd. out_fields is reset
 * before fields are selected, and the output format's needs_stats flag
 * is consulted.
 * NOTE(review): presumably needs_stats triggers update_stats() for each
 * line -- confirm.
 */
684 static void one_pass(int pass)
691 if (want_trim && (pass & 1))
694 fields_reset(&out_fields);
700 if (out_format->needs_stats)
/*
 * Two-pass processing through a temporary file. The requested output
 * format is set aside while a synthetic FORM_TMP format, backed by a
 * tmpfile() stream, serves first as the writer and then as the reader.
 * The synthetic format inherits needs_stats from the requested output
 * format. For the second phase the final format is restored with its
 * needs_stats flag cleared.
 * NOTE(review): the result of tmpfile() is stored without a NULL check on
 * the lines shown; a failure would surface only when the stream is used.
 */
707 static void two_pass(void)
709 struct format *final_format = out_format;
711 // We need to use character set info from the current locale
712 setlocale(LC_CTYPE, "");
714 // Pass 1: Set up writer of intermediate format
715 out_format = xmalloc_zero(sizeof(*out_format));
716 out_format->id = FORM_TMP;
717 out_format->read_line = tmp_read;
718 out_format->write_line = tmp_write;
719 out_format->tmp_file = tmpfile();
720 out_format->needs_stats = final_format->needs_stats;
723 // Pass 2: Set up reader of intermediate format
724 in_format = out_format;
725 rewind(in_format->tmp_file);
727 out_format = final_format;
728 out_format->needs_stats = 0;
730 fclose(in_format->tmp_file);
733 /*** Parsing of arguments ***/
/*
 * Show the command-line help text (formats, format parameters and other
 * options). Declared NONRET, i.e. it does not return to the caller.
 */
735 static void NONRET usage(void)
738 Usage: xsv <in-format> [<out-format>] <options> [<fields>]\n\
741 -t, --tsv TAB-separated values (default)\n\
742 -c, --csv Comma-separated values\n\
743 -w, --ws Values separated by arbitrary whitespace\n\
744 -W, --strict-ws Like --ws, but recognize empty columns at start/end\n\
745 -r, --regex=<rx> Separator given by Perl regular expression (input only)\n\
746 --table Format a table (output only)\n\
748 Format parameters:\n\
749 -d, --fs=<char> Delimiter of fields\n\
750 -f, --fields=<f>,... Set field names\n\
751 -h, --header The first line contains field names\n\
752 -q, --quiet Do not show warnings\n\
753 --always-quote Put quotes around all fields (CSV output only)\n\
754 --table-sep=<n> Separate table columns by <n> spaces (default: 2)\n\
757 --trim Trim leading and trailing whitespaces in fields\n\
/*
 * Report a command-line error on stderr: "xsv: " followed by the
 * printf-style message `msg`, plus a hint to run `xsv --help'.
 * Declared NONRET, i.e. it does not return to the caller.
 */
762 static void NONRET bad_args(const char *msg, ...)
767 fprintf(stderr, "xsv: ");
768 vfprintf(stderr, msg, args);
772 fprintf(stderr, "Try `xsv --help' for more information.\n");
/* Short options for getopt_long(); a ':' marks options taking an argument (-d, -f, -r). */
776 static const char short_options[] = "cd:f:hqr:twW";
/*
 * Long options for getopt_long(). The second member tells whether the
 * option takes an argument; the last member is the value getopt_long()
 * returns: either the equivalent short option letter or an OPT_* code for
 * options without a short form. The all-zero entry terminates the table.
 */
786 static const struct option long_options[] = {
787 { "always-quote", 0, NULL, OPT_ALWAYS_QUOTE },
788 { "csv", 0, NULL, 'c' },
789 { "fields", 1, NULL, 'f' },
790 { "fs", 1, NULL, 'd' },
791 { "header", 0, NULL, 'h' },
792 { "quiet", 0, NULL, 'q' },
793 { "regex", 1, NULL, 'r' },
794 { "strict-ws", 0, NULL, 'W' },
795 { "table", 0, NULL, OPT_TABLE },
796 { "table-sep", 1, NULL, OPT_TABLE_SEP },
797 { "trim", 0, NULL, OPT_TRIM },
798 { "tsv", 0, NULL, 't' },
799 { "ws", 0, NULL, 'w' },
800 { "help", 0, NULL, OPT_HELP },
801 { NULL, 0, NULL, 0 },
/*
 * Create a zero-filled format description for `format_id` and install its
 * reader/writer callbacks. In the assignments shown, formats either use
 * csv_read or ws_read as reader with csv_write as writer, or have only
 * regex_read as reader, or only table_write as writer.
 * Giving a third format is a command-line error.
 * NOTE(review): presumably the first format given becomes in_format and
 * the second out_format -- confirm.
 */
804 static void set_format(int format_id)
806 struct format *f = xmalloc_zero(sizeof(*f));
813 f->read_line = csv_read;
814 f->write_line = csv_write;
819 f->read_line = csv_read;
820 f->write_line = csv_write;
825 f->read_line = ws_read;
826 f->write_line = csv_write;
829 f->read_line = regex_read;
832 f->write_line = table_write;
840 else if (!out_format)
843 bad_args("At most two formats may be given.");
/*
 * Return the format that format-parameter options apply to.
 * NOTE(review): presumably this is the most recently selected format, with
 * TSV selected as the default when none was given yet -- confirm.
 */
846 static struct format *current_format(void)
852 set_format(FORM_TSV);
856 int main(int argc, char **argv)
861 while ((opt = getopt_long(argc, argv, short_options, long_options, NULL)) >= 0)
864 set_format(FORM_CSV);
868 current_format()->fs = optarg[0];
870 bad_args("No field delimiter given.");
873 current_format()->set_field_names = optarg;
876 current_format()->has_header = 1;
879 current_format()->quiet = 1;
882 set_format(FORM_REGEX);
883 err = regex_set(current_format(), optarg);
885 bad_args("Error compiling regex: %s", err);
888 set_format(FORM_TSV);
895 current_format()->strict_ws = 1;
897 case OPT_ALWAYS_QUOTE:
898 if (current_format()->id != FORM_CSV)
899 bad_args("--always-quote makes sense only for CSV.");
900 current_format()->always_quote = 1;
908 set_format(FORM_TABLE);
911 current_format()->table_sep = atoi(optarg);
919 out_format = in_format;
920 if (!in_format->read_line)
921 bad_args("Write-only format selected for input.");
922 if (!out_format->write_line)
923 bad_args("Read-only format selected for output.");
926 for (int i = optind; i < argc; i++) {
927 err = parse_selector(argv[i]);
931 finish_parse_selectors();
934 if (out_format->needs_stats)