2 * A Swiss-Army Knife for CSV-like Files
4 * (c) 2012 Martin Mares <mj@ucw.cz>
// Shorthand for the GCC/Clang attribute marking a function that never returns.
20 #define NONRET __attribute__((noreturn))
25 /*** General functions ***/
// Report a fatal error: writes "xsv: " followed by the printf-style formatted
// message to stderr. Declared NONRET; the statement that actually terminates
// the process is not visible in this excerpt.
27 static void NONRET die(char *msg, ...)
31 fprintf(stderr, "xsv: ");
32 vfprintf(stderr, msg, args);
38 /*** Memory allocation ***/
// malloc() wrapper used throughout the file. The out-of-memory path calls
// die() with the requested size, so callers do not check the result themselves.
40 static void *xmalloc(size_t bytes)
42 void *p = malloc(bytes);
// The test guarding this call is not visible in this excerpt.
44 die("Out of memory (cannot allocate %zu bytes)", bytes);
// Allocates `bytes` bytes through xmalloc().
// NOTE(review): the name suggests the memory is cleared before it is returned;
// the line doing that is not visible here -- confirm.
48 static void *xmalloc_zero(size_t bytes)
50 void *p = xmalloc(bytes);
// realloc() wrapper: resizes `old` to `bytes` bytes. The out-of-memory path
// calls die() with the requested size.
55 static void *xrealloc(void *old, size_t bytes)
57 void *p = realloc(old, bytes);
// The test guarding this call is not visible in this excerpt.
59 die("Out of memory (cannot allocate %zu bytes)", bytes);
/*
 * DECLARE_BUF(name, type) generates a growable array of `type`:
 *   name_t         struct holding the storage pointer, the number of used
 *                  elements (count) and the allocated capacity (max)
 *   name_init()    sets the buffer to empty with no storage
 *   name_reset()   forgets the contents (count = 0) but keeps the storage
 *   name_count()   returns the number of used elements
 *   name_extend()  grows the capacity: 16 elements at first, doubled afterwards;
 *                  storage is resized with xrealloc()
 *   name_push()    extends the buffer when it is full and returns a pointer to
 *                  the newly added last element, whose contents the caller fills in
 *   name_first()   returns a pointer to element 0
 *   name_nth()     returns a pointer to element n; no bounds check is performed
 * Pointers returned by push/first/nth may be invalidated by a later push,
 * because extend reallocates the storage.
 * The macro is instantiated below as `intarray` (a buffer of int).
 */
63 #define DECLARE_BUF(name, type) \
64 typedef struct { type *start; int count; int max; } name##_t; \
65 static inline void name##_init(name##_t *b) { b->start = NULL; b->count = b->max = 0; } \
66 static inline void name##_reset(name##_t *b) { b->count = 0; } \
67 static inline int name##_count(name##_t *b) { return b->count; } \
68 static void name##_extend(name##_t *b) { \
69 b->max = b->max ? 2*b->max : 16; \
70 b->start = xrealloc(b->start, b->max * sizeof(type)); \
72 static inline type *name##_push(name##_t *b) { \
73 if (b->count >= b->max) name##_extend(b); \
74 return &b->start[b->count++]; \
76 static inline type *name##_first(name##_t *b) { return b->start; } \
77 static inline type *name##_nth(name##_t *b, int n) { return &b->start[n]; } \
80 DECLARE_BUF(intarray, int);
82 /*** Formats and their parameters ***/
// Members of struct format (the struct header is not visible in this excerpt).
// Per-format reader callback; one_pass() treats a zero return as "no line read".
99 int (*read_line)(struct format *fmt);
// Per-format writer callback; called once per processed line.
100 void (*write_line)(struct format *fmt);
// Study data for the compiled separator regex; filled in by regex_set().
111 pcre_extra *pcre_extra;
113 // Temporary file backend:
// Formats used for reading and for writing.
120 static struct format *in_format, *out_format;
// NOTE(review): presumably set by the --trim option -- the assignment is not visible here.
121 static int want_trim;
// Growable buffers: `fields` holds struct field entries, `line` holds raw bytes.
128 DECLARE_BUF(fields, struct field);
129 DECLARE_BUF(line, unsigned char);
// Fields of the current input line and the fields selected for output.
131 static fields_t in_fields, out_fields;
// Field most recently created by new_field().
132 static struct field *in_field;
// Bytes of the current input line; fields refer to it by start_pos and len.
133 static line_t in_line;
// Reported in warnings; the place where it is updated is not visible in this excerpt.
134 static int line_number;
// Append a new field to in_fields, make it the current field (in_field) and
// record `pos` as its starting offset within in_line.
136 static void new_field(int pos)
138 in_field = fields_push(&in_fields);
139 in_field->start_pos = pos;
// NOTE(review): body not visible in this excerpt. Judging by the name it
// presumably creates a field at `pos` only when none is current -- confirm.
143 static void ensure_field(int pos)
// Print a warning to stderr: the current line_number followed by the
// printf-style formatted message.
// NOTE(review): `fmt` is presumably consulted for the quiet flag set by
// -q/--quiet; that check is not visible here -- confirm.
149 static void warn(struct format *fmt, char *msg, ...)
152 fprintf(stderr, "Warning at line %d: ", line_number);
155 vfprintf(stderr, msg, args);
// Read characters from standard input into in_line.
// The conditions selecting between the return and the push below are not
// visible in this excerpt.
161 static int next_line(void)
164 int c = getchar_unlocked();
// Returns 1 when in_line holds at least one byte, 0 when it is empty.
168 return !!line_count(&in_line);
171 *line_push(&in_line) = c;
// Measure field `f` by stepping through its bytes with mbrlen(). The result
// is used as the field's width by update_stats() and table_write().
// The accumulation and the handling of mbrlen() error returns are not
// visible in this excerpt.
175 static int field_chars(struct field *f)
177 unsigned char *s = line_nth(&in_line, f->start_pos);
// Start from a zeroed conversion state for every field.
180 memset(&mbs, 0, sizeof(mbs));
// Length in bytes of the next multibyte character, limited to the rest of the field.
184 size_t k = mbrlen((char *) s + i, f->len - i, &mbs);
194 /*** Field statistics ***/
// Largest field width seen so far for each column index.
196 static intarray_t column_widths;
// Fold the widths of the current input fields into column_widths.
198 static void update_stats(void)
200 for (int i = 0; i < fields_count(&in_fields); i++) {
201 struct field *f = fields_nth(&in_fields, i);
202 intarray_t *w = &column_widths;
// Grow the array with zero widths until column i exists.
204 while (i >= intarray_count(w))
205 *intarray_push(w) = 0;
// Keep the maximum of the stored width and this field's width.
206 int fw = field_chars(f);
207 if (*intarray_nth(w, i) < fw)
208 *intarray_nth(w, i) = fw;
212 /*** CSV/TSV back-end ***/
// Reader for separator-delimited input with optional quoting. It consumes
// characters from standard input, stores field bytes in in_line and splits
// fields on fmt->fs. Several control-flow lines are missing from this
// excerpt, so only the visible steps are described.
214 static int csv_read(struct format *fmt)
218 int c = getchar_unlocked();
219 int i = line_count(&in_line);
// End of input (negative value) or end of line finishes the record.
223 if (c < 0 || c == '\n') {
225 warn(fmt, "Missing closing quote.");
// Returns 1 when at least one field was collected, 0 otherwise.
227 return !!fields_count(&in_fields);
// A quote character needs one character of lookahead to tell a doubled
// quote from a single one.
232 if (c == fmt->quote) {
233 c = getchar_unlocked();
234 if (c != fmt->quote) {
238 // Two quotes assimilate to one
240 // Fall through to pushing the character
241 } else if (c == fmt->quote) {
// The separator only splits fields outside quotes.
244 } else if (c == fmt->fs && !quoted) {
250 *line_push(&in_line) = c;
// Returns non-zero when `c` is a space, a horizontal tab or a form feed.
// Newline, carriage return and vertical tab are not treated as whitespace here.
255 static int is_ws(int c)
257 return (c == ' ' || c == '\t' || c == '\f');
// Writer for separator-delimited output: emits the fields of out_fields to
// standard output, separated by fmt->fs, and ends the record with a newline.
// Some conditions are missing from this excerpt.
260 static void csv_write(struct format *fmt)
262 unsigned char *line = line_first(&in_line);
263 int n = fields_count(&out_fields);
264 for (int i=0; i<n; i++) {
265 struct field *f = fields_nth(&out_fields, i);
// Quoting is considered only when the format has a quote character
// (a negative fmt->quote fails this test).
267 if (fmt->quote >= 0) {
268 need_quotes = fmt->always_quote;
// Scan the field; the loop stops as soon as need_quotes becomes set.
269 for (int j=0; !need_quotes && j < f->len; j++) {
270 int c = line[f->start_pos + j];
271 if (c == fmt->fs || c == fmt->quote)
276 putchar_unlocked(fmt->fs);
278 putchar_unlocked(fmt->quote);
279 for (int j=0; j < f->len; j++) {
280 int c = line[f->start_pos + j];
// Without quoting, an embedded separator cannot be represented; warn about it.
281 if (c == fmt->fs && !need_quotes)
282 warn(fmt, "Field separator found inside field and quoting is turned off.");
288 putchar_unlocked(fmt->quote);
290 putchar_unlocked('\n');
293 /*** White-space back-end ***/
// Reader for whitespace-separated input; scans the bytes of in_line.
// Most of the body is missing from this excerpt, so only the visible lines
// are annotated.
295 static int ws_read(struct format *fmt)
300 unsigned char *line = line_first(&in_line);
301 int n = line_count(&in_line);
307 for (int i=0; i<n; i++) {
// A start_pos of 0 is tested here before the current field's start is set to i.
313 if (!in_field->start_pos &&
316 in_field->start_pos = i;
// strict_ws is the flag set by the -W/--strict-ws option.
325 if (ws && fmt->strict_ws)
330 /*** Regex back-end ***/
// Compile `rx` as the field-separator regular expression of format `f`.
// main() treats the returned string as an error message; the return
// statements themselves are not visible in this excerpt.
332 static const char *regex_set(struct format *f, char *rx)
// PCRE_DOLLAR_ENDONLY: `$` matches only at the very end of the subject.
336 f->pcre = pcre_compile(rx, PCRE_DOLLAR_ENDONLY, &err, &errpos, NULL);
// Study the pattern once; the result is passed to pcre_exec() in regex_read().
340 f->pcre_extra = pcre_study(f->pcre, 0, &err);
// Reader for regex-separated input: splits the bytes of in_line into fields
// at each match of the format's compiled separator pattern.
347 static int regex_read(struct format *fmt)
352 unsigned char *c = line_first(&in_line);
353 int n = line_count(&in_line);
// Search for the next separator starting at offset i; ovec receives the match offsets.
360 int sep = pcre_exec(fmt->pcre, fmt->pcre_extra, (char *) c, n, i, 0, ovec, 3);
// Any failure other than "no match" is reported as a warning.
362 if (sep != PCRE_ERROR_NOMATCH)
363 warn(fmt, "PCRE matching error %d", sep);
364 // No further occurrence of the separator: the rest is a single field
366 in_field->len = n - i;
// Otherwise the field ends where the separator match begins.
370 in_field->len = ovec[0] - i;
375 /*** Table back-end ***/
// Writer for table output: prints each field followed by space padding,
// using the widths collected in column_widths, then a newline.
// NOTE(review): this loop iterates in_fields, whereas csv_write() iterates
// out_fields -- confirm that field selection is meant to be bypassed here.
377 static void table_write(struct format *fmt)
379 for (int i = 0; i < fields_count(&in_fields); i++) {
// Prints fmt->table_sep spaces (an empty string padded to that width).
381 printf("%*s", fmt->table_sep, "");
382 struct field *f = fields_nth(&in_fields, i);
383 int fw = field_chars(f);
384 int cw = *intarray_nth(&column_widths, i);
// Sanity check of the stored column width against this field's width.
386 warn(fmt, "Internal error: Wrongly calculated column width (%d > %d)", fw, cw);
// Emit the raw bytes of the field.
389 unsigned char *p = line_nth(&in_line, f->start_pos);
390 for (int j = 0; j < f->len; j++)
391 putchar_unlocked(p[j]);
// Padding; the loop around this call is not visible in this excerpt.
393 putchar_unlocked(' ');
397 putchar_unlocked('\n');
400 /*** Temporary file back-end ***/
// Reader for the intermediate temporary file produced by tmp_write():
// rebuilds in_fields and in_line from length-prefixed field records.
402 static int tmp_read(struct format *fmt)
404 FILE *tf = fmt->tmp_file;
407 int c = getc_unlocked(tf);
// Long form: assemble a 32-bit big-endian length from four bytes.
// NOTE(review): no EOF check on these four reads is visible in this excerpt.
413 c = getc_unlocked(tf);
414 c = (c << 8) | getc_unlocked(tf);
415 c = (c << 8) | getc_unlocked(tf);
416 c = (c << 8) | getc_unlocked(tf);
// The field's bytes are appended at the current end of in_line.
418 new_field(line_count(&in_line));
421 int x = getc_unlocked(tf);
423 warn(fmt, "Truncated temporary file");
426 *line_push(&in_line) = x;
430 if (ferror_unlocked(tf))
431 die("I/O error when reading temporary file");
// Writer for the intermediate temporary file: stores every field of
// in_fields as a length prefix followed by the field's bytes, and finishes
// the record with a 0xff byte.
434 static void tmp_write(struct format *fmt)
436 FILE *tf = fmt->tmp_file;
438 for (int i = 0; i < fields_count(&in_fields); i++) {
439 struct field *f = fields_nth(&in_fields, i);
// Short form: the length is written as a single byte. The condition choosing
// between the short and the long form is not visible in this excerpt.
441 putc_unlocked(f->len, tf);
// Long form: 0xfe marker followed by the length as four big-endian bytes.
443 putc_unlocked(0xfe, tf);
444 putc_unlocked((f->len >> 24) & 0xff, tf);
445 putc_unlocked((f->len >> 16) & 0xff, tf);
446 putc_unlocked((f->len >> 8) & 0xff, tf);
447 putc_unlocked(f->len & 0xff, tf);
// Field payload, copied byte by byte from in_line.
450 unsigned char *p = line_nth(&in_line, f->start_pos);
451 for (int j = 0; j < f->len; j++)
452 putc_unlocked(*p++, tf);
454 putc_unlocked(0xff, tf);
456 if (ferror_unlocked(tf))
457 die("I/O error when writing temporary file");
// Strip whitespace (as defined by is_ws()) from both ends of every input
// field by adjusting start_pos and len; the bytes in in_line are not modified.
462 static void trim_fields(void)
464 unsigned char *line = line_first(&in_line);
465 for (int i = 0; i < fields_count(&in_fields); i++) {
466 struct field *f = fields_nth(&in_fields, i);
// Leading whitespace: move the start forward and shorten the field.
467 while (f->len && is_ws(line[f->start_pos]))
468 f->start_pos++, f->len--;
// Trailing whitespace: tests the last byte; the loop body is not visible here.
469 while (f->len && is_ws(line[f->start_pos + f->len - 1]))
474 /*** Field selection ***/
// Members of struct selector (header not visible here): the bounds of a
// field range, using the 1-based field numbers that select_fields() expects.
477 int first_field, last_field;
// Growable list of the selectors given on the command line.
480 DECLARE_BUF(selectors, struct selector);
481 static selectors_t selectors;
// Parse one field selector argument, either a single number or a range
// containing '-', and append it to `selectors`. main() stores the returned
// pointer in its `err` variable; the return statements are not visible here.
483 static char *parse_selector(char *str)
// Scratch buffer sized to hold a copy of the argument.
485 char buf[strlen(str) + 1];
488 struct selector *s = selectors_push(&selectors);
489 char *sep = strchr(buf, '-');
// Range form: the number before '-' and the number after it.
// NOTE(review): atoi() reports no errors, so non-numeric text silently becomes 0.
492 s->first_field = atoi(buf);
493 s->last_field = atoi(sep);
// Single-number form: first and last are the same field.
495 s->first_field = s->last_field = atoi(buf);
// Called by main() after all selector arguments have been parsed.
500 static void finish_parse_selectors(void)
// Handles the case where no selector was given; the action taken is not
// visible in this excerpt.
502 if (!selectors_count(&selectors))
// Build out_fields from in_fields according to the parsed selectors.
506 static void select_fields(void)
508 for (int i = 0; i < selectors_count(&selectors); i++) {
509 struct selector *s = selectors_nth(&selectors, i);
510 int first = s->first_field;
513 int last = s->last_field;
// The end of the range may be replaced by the number of input fields; the
// condition for doing so is not visible in this excerpt.
515 last = fields_count(&in_fields);
516 for (int j = first; j <= last; j++) {
517 struct field *f = fields_push(&out_fields);
// Field numbers are 1-based; an existing field is copied ...
518 if (j >= 1 && j <= fields_count(&in_fields))
519 *f = *fields_nth(&in_fields, j-1);
// ... and anything else is set to an empty field.
521 f->start_pos = f->len = 0;
526 /*** Processing of files ***/
// Process the input in a single pass: for every line, clear the per-line
// buffers, read the line with in_format, rebuild out_fields and write the
// result with out_format. I/O errors on stdin or stdout are fatal.
528 static void one_pass(void)
// Each input line starts with empty field and byte buffers.
533 fields_reset(&in_fields);
534 line_reset(&in_line);
// A zero return from the reader means no line was read; the statement under
// this test is not visible in this excerpt.
536 if (!in_format->read_line(in_format))
538 if (ferror_unlocked(stdin))
539 die("I/O error when reading standard input");
544 fields_reset(&out_fields);
// Work that only some output formats need; the guarded statement is not
// visible in this excerpt.
547 if (out_format->needs_stats)
549 out_format->write_line(out_format);
// This check is on stdout, so the message names standard output.
550 if (ferror_unlocked(stdout))
551 die("I/O error when writing standard output");
// Process the input in two passes through a temporary file: pass 1 writes
// the data in the intermediate format, pass 2 reads it back and writes it
// with the requested output format. The calls that run each pass are not
// visible in this excerpt.
555 static void two_pass(void)
557 struct format *final_format = out_format;
559 // We need to use character set info from the current locale
560 setlocale(LC_CTYPE, "");
562 // Pass 1: Set up writer of intermediate format
563 out_format = xmalloc_zero(sizeof(*out_format));
564 out_format->id = FORM_TMP;
565 out_format->read_line = tmp_read;
566 out_format->write_line = tmp_write;
// NOTE(review): no check of the tmpfile() result is visible in this excerpt;
// tmpfile() returns NULL on failure.
567 out_format->tmp_file = tmpfile();
// The intermediate writer takes over the final format's needs_stats flag.
568 out_format->needs_stats = final_format->needs_stats;
571 // Pass 2: Set up reader of intermediate format
572 in_format = out_format;
573 rewind(in_format->tmp_file);
574 out_format = final_format;
// The flag is cleared on the final format for the second pass.
575 out_format->needs_stats = 0;
577 fclose(in_format->tmp_file);
580 /*** Parsing of arguments ***/
// Usage/help text for the program. Declared NONRET; the call that prints the
// text and the call that exits are not visible in this excerpt. The lines
// below belong to one string literal continued with backslash-newline.
582 static void NONRET usage(void)
585 Usage: xsv <in-format> [<out-format>] <options> [<fields>]\n\
588 -t, --tsv TAB-separated values (default)\n\
589 -c, --csv Comma-separated values\n\
590 -w, --ws Values separated by arbitrary whitespace\n\
591 -W, --strict-ws Like --ws, but recognize empty columns at start/end\n\
592 -r, --regex=<rx> Separator given by Perl regular expression (input only)\n\
593 --table Format a table (output only)\n\
595 Format parameters:\n\
596 -d, --fs=<char> Delimiter of fields\n\
597 -q, --quiet Do not show warnings\n\
598 --always-quote Put quotes around all fields (CSV output only)\n\
599 --table-sep=<n> Separate table columns by <n> spaces (default: 2)\n\
602 --trim Trim leading and trailing whitespaces in fields\n\
// Report a command-line error: writes "xsv: " and the printf-style formatted
// message to stderr, followed by a hint to run `xsv --help'. Declared NONRET;
// the exit call is not visible in this excerpt.
607 static void NONRET bad_args(const char *msg, ...)
612 fprintf(stderr, "xsv: ");
613 vfprintf(stderr, msg, args);
617 fprintf(stderr, "Try `xsv --help' for more information.\n");
// Short options for getopt_long(); a trailing ':' marks an option that takes
// an argument (-d and -r).
621 static const char short_options[] = "cd:qr:twW";
// Long options. The second member is 1 when the option takes an argument.
// The flag pointer is NULL everywhere, so getopt_long() returns the last
// member: either the matching short option letter or an OPT_* code for
// options that have no short form.
631 static const struct option long_options[] = {
632 { "always-quote", 0, NULL, OPT_ALWAYS_QUOTE },
633 { "csv", 0, NULL, 'c' },
634 { "fs", 1, NULL, 'd' },
635 { "quiet", 0, NULL, 'q' },
636 { "regex", 1, NULL, 'r' },
637 { "strict-ws", 0, NULL, 'W' },
638 { "table", 0, NULL, OPT_TABLE },
639 { "table-sep", 1, NULL, OPT_TABLE_SEP },
640 { "trim", 0, NULL, OPT_TRIM },
641 { "tsv", 0, NULL, 't' },
642 { "ws", 0, NULL, 'w' },
643 { "help", 0, NULL, OPT_HELP },
// All-zero entry terminating the table.
644 { NULL, 0, NULL, 0 },
// Create a new format of kind `format_id`, install its reader/writer
// callbacks and register it as one of the (at most two) formats in use.
// The switch and case labels are missing from this excerpt; a callback that
// is not assigned stays NULL because the structure comes from xmalloc_zero().
647 static void set_format(int format_id)
649 struct format *f = xmalloc_zero(sizeof(*f));
656 f->read_line = csv_read;
657 f->write_line = csv_write;
662 f->read_line = csv_read;
663 f->write_line = csv_write;
// Whitespace-separated input has its own reader but shares the CSV writer.
668 f->read_line = ws_read;
669 f->write_line = csv_write;
// Only a reader is assigned in this visible line.
672 f->read_line = regex_read;
// Only a writer is assigned in this visible line.
675 f->write_line = table_write;
// Second registration slot: used when out_format is still unset.
683 else if (!out_format)
686 bad_args("At most two formats may be given.");
// Returns the format that format-parameter options are applied to.
// NOTE(review): the selection logic is not visible in this excerpt; the call
// below presumably supplies TSV when no format was chosen yet -- confirm.
689 static struct format *current_format(void)
695 set_format(FORM_TSV);
// Program entry point: parses format options and field selectors.
// The case labels of the option switch and the end of the function are not
// visible in this excerpt.
699 int main(int argc, char **argv)
704 while ((opt = getopt_long(argc, argv, short_options, long_options, NULL)) >= 0)
707 set_format(FORM_CSV);
// Only the first character of the argument is used as the delimiter.
711 current_format()->fs = optarg[0];
713 bad_args("No field delimiter given.");
716 current_format()->quiet = 1;
// The regex format is created first so that the pattern is stored in it.
719 set_format(FORM_REGEX);
720 err = regex_set(current_format(), optarg);
722 bad_args("Error compiling regex: %s", err);
725 set_format(FORM_TSV);
732 current_format()->strict_ws = 1;
734 case OPT_ALWAYS_QUOTE:
735 if (current_format()->id != FORM_CSV)
736 bad_args("--always-quote makes sense only for CSV.");
737 current_format()->always_quote = 1;
745 set_format(FORM_TABLE);
// NOTE(review): atoi() reports no errors, so a non-numeric argument silently becomes 0.
748 current_format()->table_sep = atoi(optarg);
// The input format doubles as the output format; the condition guarding
// this assignment is not visible in this excerpt.
756 out_format = in_format;
// A format without the needed callback cannot be used in that direction.
757 if (!in_format->read_line)
758 bad_args("Write-only format selected for input.");
759 if (!out_format->write_line)
760 bad_args("Read-only format selected for output.");
// The remaining arguments are field selectors.
762 for (int i = optind; i < argc; i++) {
763 err = parse_selector(argv[i]);
767 finish_parse_selectors();
769 if (out_format->needs_stats)