2 * A Swiss-Army Knife for CSV-like Files
4 * (c) 2012 Martin Mares <mj@ucw.cz>
15 /*** Memory allocation ***/
17 static void *xmalloc(size_t bytes)
19 void *p = malloc(bytes);
21 fprintf(stderr, "xsv: Out of memory (cannot allocate %zu bytes)\n", bytes);
27 static void *xmalloc_zero(size_t bytes)
29 void *p = xmalloc(bytes);
34 static void *xrealloc(void *old, size_t bytes)
36 void *p = realloc(old, bytes);
38 fprintf(stderr, "xsv: Out of memory (cannot allocate %zu bytes)\n", bytes);
44 #define DECLARE_BUF(name, type) \
45 typedef struct { type *start; int count; int max; } name##_t; \
46 static inline void name##_init(name##_t *b) { b->start = NULL; b->count = b->max = 0; } \
47 static inline void name##_reset(name##_t *b) { b->count = 0; } \
48 static inline int name##_count(name##_t *b) { return b->count; } \
49 static void name##_extend(name##_t *b) { \
50 b->max = b->max ? 2*b->max : 16; \
51 b->start = xrealloc(b->start, b->max * sizeof(type)); \
53 static inline type *name##_push(name##_t *b) { \
54 if (b->count >= b->max) name##_extend(b); \
55 return &b->start[b->count++]; \
57 static inline type *name##_first(name##_t *b) { return b->start; } \
58 static inline type *name##_nth(name##_t *b, int n) { return &b->start[n]; } \
61 DECLARE_BUF(intarray, int);
63 /*** Formats and their parameters ***/
80 int (*read_line)(void);
81 void (*write_line)(void);
92 pcre_extra *pcre_extra;
94 // Temporary file backend:
96 intarray_t column_widths;
102 static struct format *in_format, *out_format;
103 static int want_trim;
110 DECLARE_BUF(fields, struct field);
111 DECLARE_BUF(line, unsigned char);
113 static fields_t in_fields, out_fields;
114 static struct field *in_field;
115 static line_t in_line;
116 static int line_number;
118 static void new_field(int pos)
120 in_field = fields_push(&in_fields);
121 in_field->start_pos = pos;
125 static void ensure_field(int pos)
131 static void warn(struct format *fmt, char *msg, ...)
134 fprintf(stderr, "Warning at line %d: ", line_number);
137 vfprintf(stderr, msg, args);	// vfprintf() takes the format string first, then the va_list
143 static int next_line(void)
150 return !!line_count(&in_line);
153 *line_push(&in_line) = c;
157 /*** CSV/TSV back-end ***/
159 static int csv_read(void)
164 int i = line_count(&in_line);
168 if (c < 0 || c == '\n') {
170 warn(in_format, "Missing closing quote.");
172 return !!fields_count(&in_fields);
177 if (c == in_format->quote) {
179 if (c != in_format->quote) {
183 // Two consecutive quotes stand for one literal quote character
185 // Fall through to pushing the character
186 } else if (c == in_format->quote) {
189 } else if (c == in_format->fs && !quoted) {
195 *line_push(&in_line) = c;
200 static int is_ws(int c)
202 return (c == ' ' || c == '\t' || c == '\f');
205 static void csv_write(void)
207 unsigned char *line = line_first(&in_line);
208 int n = fields_count(&out_fields);
209 for (int i=0; i<n; i++) {
210 struct field *f = fields_nth(&out_fields, i);
212 if (out_format->quote >= 0) {
213 need_quotes = out_format->always_quote;
214 for (int j=0; !need_quotes && j < f->len; j++) {
215 int c = line[f->start_pos + j];
216 if (c == out_format->fs || c == out_format->quote)
221 putchar(out_format->fs);
223 putchar(out_format->quote);
224 for (int j=0; j < f->len; j++) {
225 int c = line[f->start_pos + j];
226 if (c == out_format->fs && !need_quotes)
227 warn(out_format, "Field separator found inside field and quoting is turned off.");
228 if (c == out_format->quote)
233 putchar(out_format->quote);
238 /*** White-space back-end ***/
240 static int ws_read(void)
245 unsigned char *line = line_first(&in_line);
246 int n = line_count(&in_line);
252 for (int i=0; i<n; i++) {
258 if (!in_field->start_pos &&
260 !in_format->strict_ws)
261 in_field->start_pos = i;
270 if (ws && in_format->strict_ws)
275 /*** Regex back-end ***/
277 static const char *regex_set(struct format *f, char *rx)
281 f->pcre = pcre_compile(rx, PCRE_DOLLAR_ENDONLY, &err, &errpos, NULL);
285 f->pcre_extra = pcre_study(f->pcre, 0, &err);
292 static int regex_read(void)
297 unsigned char *c = line_first(&in_line);
298 int n = line_count(&in_line);
305 int sep = pcre_exec(in_format->pcre, in_format->pcre_extra, (char *) c, n, i, 0, ovec, 3);
307 if (sep != PCRE_ERROR_NOMATCH)
308 warn(in_format, "PCRE matching error %d", sep);
309 // No further occurrence of the separator: the rest is a single field
311 in_field->len = n - i;
315 in_field->len = ovec[0] - i;
320 /*** Table back-end ***/
322 static void table_write(void)
324 for (int i = 0; i < fields_count(&in_fields); i++) {
326 printf("%*s", out_format->table_sep, "");
327 struct field *f = fields_nth(&in_fields, i);
328 int w = *intarray_nth(&in_format->column_widths, i);
330 warn(out_format, "Internal error: Wrongly calculated column width (%d > %d)", f->len, w);
334 unsigned char *p = line_nth(&in_line, f->start_pos);
347 /*** Temporary file back-end ***/
349 static int tmp_read(void)
351 FILE *tf = in_format->tmp_file;
361 c = (c << 8) | fgetc(tf);
362 c = (c << 8) | fgetc(tf);
363 c = (c << 8) | fgetc(tf);
365 new_field(line_count(&in_line));
370 warn(in_format, "Truncated temporary file");
373 *line_push(&in_line) = x;
378 static void tmp_write(void)
380 FILE *tf = out_format->tmp_file;
382 for (int i = 0; i < fields_count(&in_fields); i++) {
383 struct field *f = fields_nth(&in_fields, i);
388 fputc((f->len >> 24) & 0xff, tf);
389 fputc((f->len >> 16) & 0xff, tf);
390 fputc((f->len >> 8) & 0xff, tf);
391 fputc(f->len & 0xff, tf);
394 unsigned char *p = line_nth(&in_line, f->start_pos);
395 for (int j = 0; j < f->len; j++)
398 intarray_t *w = &out_format->column_widths;
399 while (i >= intarray_count(w))
400 *intarray_push(w) = 0;
401 if (*intarray_nth(w, i) < f->len)
402 *intarray_nth(w, i) = f->len;
409 static void trim_fields(void)
411 unsigned char *line = line_first(&in_line);
412 for (int i = 0; i < fields_count(&in_fields); i++) {
413 struct field *f = fields_nth(&in_fields, i);
414 while (f->len && is_ws(line[f->start_pos]))
415 f->start_pos++, f->len--;
416 while (f->len && is_ws(line[f->start_pos + f->len - 1]))
421 /*** Field selection ***/
424 int first_field, last_field;
427 DECLARE_BUF(selectors, struct selector);
428 static selectors_t selectors;
430 static char *parse_selector(char *str)
432 char buf[strlen(str) + 1];
435 struct selector *s = selectors_push(&selectors);
436 char *sep = strchr(buf, '-');
439 s->first_field = atoi(buf);
440 s->last_field = atoi(sep);
442 s->first_field = s->last_field = atoi(buf);
447 static void finish_parse_selectors(void)
449 if (!selectors_count(&selectors))
453 static void select_fields(void)
455 for (int i = 0; i < selectors_count(&selectors); i++) {
456 struct selector *s = selectors_nth(&selectors, i);
457 int first = s->first_field;
460 int last = s->last_field;
462 last = fields_count(&in_fields);
463 for (int j = first; j <= last; j++) {
464 struct field *f = fields_push(&out_fields);
465 if (j >= 1 && j <= fields_count(&in_fields))
466 *f = *fields_nth(&in_fields, j-1);
468 f->start_pos = f->len = 0;
473 /*** Processing of files ***/
475 static void one_pass(void)
480 fields_reset(&in_fields);
481 line_reset(&in_line);
483 if (!in_format->read_line())
489 fields_reset(&out_fields);
492 out_format->write_line();
496 static void two_pass(void)
498 struct format *final_format = out_format;
500 // Pass 1: Set up writer of intermediate format
501 out_format = xmalloc_zero(sizeof(*out_format));
502 out_format->id = FORM_TMP;
503 out_format->read_line = tmp_read;
504 out_format->write_line = tmp_write;
505 out_format->tmp_file = tmpfile();
506 intarray_init(&out_format->column_widths);
509 // Pass 2: Set up reader of intermediate format
510 in_format = out_format;
511 rewind(in_format->tmp_file);
512 out_format = final_format;
514 fclose(in_format->tmp_file);
517 /*** Parsing of arguments ***/
519 static void usage(void)
522 Usage: xsv <in-format> [<out-format>] <options> [<fields>]\n\
525 -t, --tsv TAB-separated values (default)\n\
526 -c, --csv Comma-separated values\n\
527 -w, --ws Values separated by arbitrary whitespace\n\
528 -W, --strict-ws Like --ws, but recognize empty columns at start/end\n\
529 -r, --regex=<rx> Separator given by Perl regular expression (input only)\n\
530 --table Format a table (output only)\n\
532 Format parameters:\n\
533 -d, --fs=<char> Delimiter of fields\n\
534 -q, --quiet Do not show warnings\n\
535 --always-quote Put quotes around all fields (CSV output only)\n\
536 --table-sep=<n> Separate table columns by <n> spaces (default: 2)\n\
539 --trim Trim leading and trailing whitespaces in fields\n\
544 static void bad_args(const char *msg, ...)
549 fprintf(stderr, "xsv: ");
550 vfprintf(stderr, msg, args);
554 fprintf(stderr, "Try `xsv --help' for more information.\n");
558 static const char short_options[] = "cd:qr:twW";
568 static const struct option long_options[] = {
569 { "always-quote", 0, NULL, OPT_ALWAYS_QUOTE },
570 { "csv", 0, NULL, 'c' },
571 { "fs", 1, NULL, 'd' },
572 { "quiet", 0, NULL, 'q' },
573 { "regex", 1, NULL, 'r' },
574 { "strict-ws", 0, NULL, 'W' },
575 { "table", 0, NULL, OPT_TABLE },
576 { "table-sep", 1, NULL, OPT_TABLE_SEP },
577 { "trim", 0, NULL, OPT_TRIM },
578 { "tsv", 0, NULL, 't' },
579 { "ws", 0, NULL, 'w' },
580 { "help", 0, NULL, OPT_HELP },
581 { NULL, 0, NULL, 0 },
584 static void set_format(int format_id)
586 struct format *f = xmalloc_zero(sizeof(*f));
593 f->read_line = csv_read;
594 f->write_line = csv_write;
599 f->read_line = csv_read;
600 f->write_line = csv_write;
605 f->read_line = ws_read;
606 f->write_line = csv_write;
609 f->read_line = regex_read;
612 f->write_line = table_write;
613 f->needs_two_passes = 1;
620 else if (!out_format)
623 bad_args("At most two formats may be given.");
626 static struct format *current_format(void)
632 set_format(FORM_TSV);
636 int main(int argc, char **argv)
641 while ((opt = getopt_long(argc, argv, short_options, long_options, NULL)) >= 0)
644 set_format(FORM_CSV);
648 current_format()->fs = optarg[0];
650 bad_args("No field delimiter given.");
653 current_format()->quiet = 1;
656 set_format(FORM_REGEX);
657 err = regex_set(current_format(), optarg);
659 bad_args("Error compiling regex: %s", err);
662 set_format(FORM_TSV);
669 current_format()->strict_ws = 1;
671 case OPT_ALWAYS_QUOTE:
672 if (current_format()->id != FORM_CSV)
673 bad_args("--always-quote makes sense only for CSV.");
674 current_format()->always_quote = 1;
682 set_format(FORM_TABLE);
685 current_format()->table_sep = atoi(optarg);
693 out_format = in_format;
694 if (!in_format->read_line)
695 bad_args("Write-only format selected for input.");
696 if (!out_format->write_line)
697 bad_args("Read-only format selected for output.");
699 for (int i = optind; i < argc; i++) {
700 err = parse_selector(argv[i]);
704 finish_parse_selectors();
706 fields_init(&in_fields);
707 fields_init(&out_fields);
710 if (out_format->needs_two_passes)