]> mj.ucw.cz Git - xsv.git/blob - xsv.c
56f558369089eebfd6e0eb606062587dd6323dd4
[xsv.git] / xsv.c
1 /*
2  *      A Swiss-Army Knife for CSV-like Files
3  *
4  *      (c) 2012 Martin Mares <mj@ucw.cz>
5  */
6
7 #define _GNU_SOURCE
8
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#include <limits.h>
#include <getopt.h>
#include <wchar.h>
#include <locale.h>

#include <pcre.h>
18
19 #ifdef __GNUC__
20 #define NONRET __attribute__((noreturn))
21 #else
22 #define NONRET
23 #endif
24
25 /*** General functions ***/
26
27 static void NONRET die(char *msg, ...)
28 {
29         va_list args;
30         va_start(args, msg);
31         fprintf(stderr, "xsv: ");
32         vfprintf(stderr, msg, args);
33         fputc('\n', stderr);
34         va_end(args);
35         exit(1);
36 }
37
38 /*** Memory allocation ***/
39
/*
 * malloc() wrapper which never returns NULL: if the allocation fails,
 * the program is terminated via die().
 */
static void *xmalloc(size_t bytes)
{
	void *mem = malloc(bytes);

	if (mem == NULL)
		die("Out of memory (cannot allocate %zu bytes)", bytes);
	return mem;
}
47
/* Like xmalloc(), but the returned block is filled with zero bytes. */
static void *xmalloc_zero(size_t bytes)
{
	// memset() returns its first argument, i.e., the freshly allocated block
	return memset(xmalloc(bytes), 0, bytes);
}
54
/*
 * realloc() wrapper which terminates the program via die() whenever
 * realloc() returns NULL. Otherwise the (possibly moved) block is returned.
 */
static void *xrealloc(void *old, size_t bytes)
{
	void *mem = realloc(old, bytes);

	if (mem == NULL)
		die("Out of memory (cannot allocate %zu bytes)", bytes);
	return mem;
}
62
/*
 * DECLARE_BUF(name, type) generates a growable array of `type':
 *
 *	name_t			the buffer itself (start pointer, used count, capacity)
 *	name_init(b)		make the buffer empty, without any storage
 *	name_reset(b)		forget all elements, but keep the storage
 *	name_count(b)		number of elements stored
 *	name_extend(b)		grow the capacity (16 elements at first, doubled afterwards)
 *	name_push(b)		append one uninitialized element and return a pointer to it
 *	name_first(b)		pointer to the first element
 *	name_nth(b, n)		pointer to the n-th element (counted from 0, not range-checked)
 *
 * The storage is obtained by xrealloc(), so element pointers may become
 * invalid whenever another element is pushed.
 */
#define DECLARE_BUF(name, type) \
	typedef struct { type *start; int count; int max; } name##_t; \
	static inline void name##_init(name##_t *buf) \
	{ \
		buf->start = NULL; \
		buf->count = 0; \
		buf->max = 0; \
	} \
	static inline void name##_reset(name##_t *buf) { buf->count = 0; } \
	static inline int name##_count(name##_t *buf) { return buf->count; } \
	static void name##_extend(name##_t *buf) \
	{ \
		if (buf->max) \
			buf->max = 2*buf->max; \
		else \
			buf->max = 16; \
		buf->start = xrealloc(buf->start, buf->max * sizeof(type)); \
	} \
	static inline type *name##_push(name##_t *buf) \
	{ \
		if (buf->count >= buf->max) \
			name##_extend(buf); \
		return &buf->start[buf->count++]; \
	} \
	static inline type *name##_first(name##_t *buf) { return buf->start; } \
	static inline type *name##_nth(name##_t *buf, int n) { return &buf->start[n]; } \
	// end

// Growable array of ints
DECLARE_BUF(intarray, int);
81
82 /*** Formats and their parameters ***/
83
/* Identifiers of all formats known to the program. */
enum format_id {
	FORM_UNSPEC = 0,	// Value of a zero-initialized id
	FORM_TSV,		// TAB-separated values
	FORM_CSV,		// Comma-separated values
	FORM_WS,		// Values separated by whitespace
	FORM_REGEX,		// Separator given by a regular expression
	FORM_TMP,		// Intermediate temporary file of two-pass processing
	FORM_TABLE,		// Formatted table
};
93
94 struct format {
95         enum format_id id;
96         int fs;
97         int quote;
98         int quiet;
99         int (*read_line)(void);
100         void (*write_line)(void);
101         int needs_two_passes;
102
103         // CSV backend:
104         int always_quote;
105
106         // WS backend:
107         int strict_ws;
108
109         // regex backend:
110         pcre *pcre;
111         pcre_extra *pcre_extra;
112
113         // Temporary file backend:
114         FILE *tmp_file;
115         intarray_t column_widths;
116
117         // Table backend:
118         int table_sep;
119 };
120
121 static struct format *in_format, *out_format;
122 static int want_trim;
123
124 struct field {
125         int start_pos;
126         int len;
127 };
128
129 DECLARE_BUF(fields, struct field);
130 DECLARE_BUF(line, unsigned char);
131
132 static fields_t in_fields, out_fields;
133 static struct field *in_field;
134 static line_t in_line;
135 static int line_number;
136
137 static void new_field(int pos)
138 {
139         in_field = fields_push(&in_fields);
140         in_field->start_pos = pos;
141         in_field->len = 0;
142 }
143
144 static void ensure_field(int pos)
145 {
146         if (!in_field)
147                 new_field(pos);
148 }
149
150 static void warn(struct format *fmt, char *msg, ...)
151 {
152         if (!fmt->quiet) {
153                 fprintf(stderr, "Warning at line %d: ", line_number);
154                 va_list args;
155                 va_start(args, msg);
156                 vfprintf(stderr, msg, args);
157                 va_end(args);
158                 fputc('\n', stderr);
159         }
160 }
161
162 static int next_line(void)
163 {
164         for (;;) {
165                 int c = getchar_unlocked();
166                 if (c == '\r')
167                         continue;
168                 if (c < 0)
169                         return !!line_count(&in_line);
170                 if (c == '\n')
171                         return 1;
172                 *line_push(&in_line) = c;
173         }
174 }
175
176 static int field_chars(struct field *f)
177 {
178         unsigned char *s = line_nth(&in_line, f->start_pos);
179         int i = 0;
180         mbstate_t mbs;
181         memset(&mbs, 0, sizeof(mbs));
182
183         int chars = 0;
184         while (i < f->len) {
185                 size_t k = mbrlen((char *) s + i, f->len - i, &mbs);
186                 if ((int) k <= 0)
187                         break;
188                 i += k;
189                 chars++;
190         }
191
192         return chars;
193 }
194
/*** CSV/TSV back-end ***/
196
197 static int csv_read(void)
198 {
199         int quoted = 0;
200         for (;;) {
201                 int c = getchar_unlocked();
202                 int i = line_count(&in_line);
203 restart:
204                 if (c == '\r')
205                         continue;
206                 if (c < 0 || c == '\n') {
207                         if (quoted)
208                                 warn(in_format, "Missing closing quote.");
209                         if (c < 0)
210                                 return !!fields_count(&in_fields);
211                         else
212                                 return 1;
213                 }
214                 if (quoted) {
215                         if (c == in_format->quote) {
216                                 c = getchar_unlocked();
217                                 if (c != in_format->quote) {
218                                         quoted = 0;
219                                         goto restart;
220                                 }
221                                 // Two quotes assimilate to one
222                         }
223                         // Fall through to pushing the character
224                 } else if (c == in_format->quote) {
225                         quoted = 1;
226                         continue;
227                 } else if (c == in_format->fs && !quoted) {
228                         ensure_field(i);
229                         new_field(i);
230                         continue;
231                 }
232                 ensure_field(i);
233                 *line_push(&in_line) = c;
234                 in_field->len++;
235         }
236 }
237
/* Return 1 if c is a space, a tab or a form feed, 0 otherwise. */
static int is_ws(int c)
{
	switch (c) {
	case ' ':
	case '\t':
	case '\f':
		return 1;
	default:
		return 0;
	}
}
242
243 static void csv_write(void)
244 {
245         unsigned char *line = line_first(&in_line);
246         int n = fields_count(&out_fields);
247         for (int i=0; i<n; i++) {
248                 struct field *f = fields_nth(&out_fields, i);
249                 int need_quotes = 0;
250                 if (out_format->quote >= 0) {
251                         need_quotes = out_format->always_quote;
252                         for (int j=0; !need_quotes && j < f->len; j++) {
253                                 int c = line[f->start_pos + j];
254                                 if (c == out_format->fs || c == out_format->quote)
255                                         need_quotes = 1;
256                         }
257                 }
258                 if (i)
259                         putchar_unlocked(out_format->fs);
260                 if (need_quotes)
261                         putchar_unlocked(out_format->quote);
262                 for (int j=0; j < f->len; j++) {
263                         int c = line[f->start_pos + j];
264                         if (c == out_format->fs && !need_quotes)
265                                 warn(out_format, "Field separator found inside field and quoting is turned off.");
266                         if (c == out_format->quote)
267                                 putchar_unlocked(c);
268                         putchar_unlocked(c);
269                 }
270                 if (need_quotes)
271                         putchar_unlocked(out_format->quote);
272         }
273         putchar_unlocked('\n');
274 }
275
276 /*** White-space back-end ***/
277
278 static int ws_read(void)
279 {
280         if (!next_line())
281                 return 0;
282
283         unsigned char *line = line_first(&in_line);
284         int n = line_count(&in_line);
285         if (!n)
286                 return 1;
287
288         int ws = 0;
289         new_field(0);
290         for (int i=0; i<n; i++) {
291                 int c = line[i];
292                 if (is_ws(c)) {
293                         ws++;
294                 } else {
295                         if (ws) {
296                                 if (!in_field->start_pos &&
297                                     !in_field->len &&
298                                     !in_format->strict_ws)
299                                         in_field->start_pos = i;
300                                 else
301                                         new_field(i);
302                                 ws = 0;
303                         }
304                         in_field->len++;
305                 }
306         }
307
308         if (ws && in_format->strict_ws)
309                 new_field(n);
310         return 1;
311 }
312
313 /*** Regex back-end ***/
314
315 static const char *regex_set(struct format *f, char *rx)
316 {
317         const char *err;
318         int errpos;
319         f->pcre = pcre_compile(rx, PCRE_DOLLAR_ENDONLY, &err, &errpos, NULL);
320         if (!f->pcre)
321                 return err;
322
323         f->pcre_extra = pcre_study(f->pcre, 0, &err);
324         if (!f->pcre_extra)
325                 return err;
326
327         return NULL;
328 }
329
330 static int regex_read(void)
331 {
332         if (!next_line())
333                 return 0;
334
335         unsigned char *c = line_first(&in_line);
336         int n = line_count(&in_line);
337         if (!n)
338                 return 1;
339
340         int i = 0;
341         for (;;) {
342                 int ovec[3];
343                 int sep = pcre_exec(in_format->pcre, in_format->pcre_extra, (char *) c, n, i, 0, ovec, 3);
344                 if (sep < 0) {
345                         if (sep != PCRE_ERROR_NOMATCH)
346                                 warn(in_format, "PCRE matching error %d", sep);
347                         // No further occurrence of the separator: the rest is a single field
348                         new_field(i);
349                         in_field->len = n - i;
350                         return 1;
351                 }
352                 new_field(i);
353                 in_field->len = ovec[0] - i;
354                 i = ovec[1];
355         }
356 }
357
358 /*** Table back-end ***/
359
360 static void table_write(void)
361 {
362         for (int i = 0; i < fields_count(&in_fields); i++) {
363                 if (i)
364                         printf("%*s", out_format->table_sep, "");
365                 struct field *f = fields_nth(&in_fields, i);
366                 int fw = field_chars(f);
367                 int cw = *intarray_nth(&in_format->column_widths, i);
368                 if (fw > cw) {
369                         warn(out_format, "Internal error: Wrongly calculated column width (%d > %d)", fw, cw);
370                         cw = fw;
371                 }
372                 unsigned char *p = line_nth(&in_line, f->start_pos);
373                 for (int j = 0; j < f->len; j++)
374                         putchar_unlocked(p[j]);
375                 while (fw < cw) {
376                         putchar_unlocked(' ');
377                         fw++;
378                 }
379         }
380         putchar_unlocked('\n');
381 }
382
383 /*** Temporary file back-end ***/
384
385 static int tmp_read(void)
386 {
387         FILE *tf = in_format->tmp_file;
388
389         for (;;) {
390                 int c = getc_unlocked(tf);
391                 if (c < 0)
392                         return 0;
393                 if (c == 0xff)
394                         return 1;
395                 if (c == 0xfe) {
396                         c = getc_unlocked(tf);
397                         c = (c << 8) | getc_unlocked(tf);
398                         c = (c << 8) | getc_unlocked(tf);
399                         c = (c << 8) | getc_unlocked(tf);
400                 }
401                 new_field(line_count(&in_line));
402                 in_field->len = c;
403                 while (c--) {
404                         int x = getc_unlocked(tf);
405                         if (x < 0) {
406                                 warn(in_format, "Truncated temporary file");
407                                 return 0;
408                         }
409                         *line_push(&in_line) = x;
410                 }
411         }
412
413         if (ferror_unlocked(tf))
414                 die("I/O error when reading temporary file");
415 }
416
417 static void tmp_write(void)
418 {
419         FILE *tf = out_format->tmp_file;
420
421         for (int i = 0; i < fields_count(&in_fields); i++) {
422                 struct field *f = fields_nth(&in_fields, i);
423                 if (f->len < 0xfe)
424                         putc_unlocked(f->len, tf);
425                 else {
426                         putc_unlocked(0xfe, tf);
427                         putc_unlocked((f->len >> 24) & 0xff, tf);
428                         putc_unlocked((f->len >> 16) & 0xff, tf);
429                         putc_unlocked((f->len >> 8) & 0xff, tf);
430                         putc_unlocked(f->len & 0xff, tf);
431                 }
432
433                 unsigned char *p = line_nth(&in_line, f->start_pos);
434                 for (int j = 0; j < f->len; j++)
435                         putc_unlocked(*p++, tf);
436
437                 intarray_t *w = &out_format->column_widths;
438                 while (i >= intarray_count(w))
439                         *intarray_push(w) = 0;
440                 int fw = field_chars(f);
441                 if (*intarray_nth(w, i) < fw)
442                         *intarray_nth(w, i) = fw;
443         }
444         putc_unlocked(0xff, tf);
445
446         if (ferror_unlocked(tf))
447                 die("I/O error when writing temporary file");
448 }
449
450 /*** Transforms ***/
451
452 static void trim_fields(void)
453 {
454         unsigned char *line = line_first(&in_line);
455         for (int i = 0; i < fields_count(&in_fields); i++) {
456                 struct field *f = fields_nth(&in_fields, i);
457                 while (f->len && is_ws(line[f->start_pos]))
458                         f->start_pos++, f->len--;
459                 while (f->len && is_ws(line[f->start_pos + f->len - 1]))
460                         f->len--;
461         }
462 }
463
464 /*** Field selection ***/
465
466 struct selector {
467         int first_field, last_field;
468 };
469
470 DECLARE_BUF(selectors, struct selector);
471 static selectors_t selectors;
472
473 static char *parse_selector(char *str)
474 {
475         char buf[strlen(str) + 1];
476         strcpy(buf, str);
477
478         struct selector *s = selectors_push(&selectors);
479         char *sep = strchr(buf, '-');
480         if (sep) {
481                 *sep++ = 0;
482                 s->first_field = atoi(buf);
483                 s->last_field = atoi(sep);
484         } else
485                 s->first_field = s->last_field = atoi(buf);
486
487         return NULL;
488 }
489
490 static void finish_parse_selectors(void)
491 {
492         if (!selectors_count(&selectors))
493                 parse_selector("-");
494 }
495
496 static void select_fields(void)
497 {
498         for (int i = 0; i < selectors_count(&selectors); i++) {
499                 struct selector *s = selectors_nth(&selectors, i);
500                 int first = s->first_field;
501                 if (first <= 0)
502                         first = 1;
503                 int last = s->last_field;
504                 if (last <= 0)
505                         last = fields_count(&in_fields);
506                 for (int j = first; j <= last; j++) {
507                         struct field *f = fields_push(&out_fields);
508                         if (j >= 1 && j <= fields_count(&in_fields))
509                                 *f = *fields_nth(&in_fields, j-1);
510                         else
511                                 f->start_pos = f->len = 0;
512                 }
513         }
514 }
515
516 /*** Processing of files ***/
517
518 static void one_pass(void)
519 {
520         line_number = 0;
521         for (;;) {
522                 line_number++;
523                 fields_reset(&in_fields);
524                 line_reset(&in_line);
525                 in_field = NULL;
526                 if (!in_format->read_line())
527                         break;
528                 if (ferror_unlocked(stdin))
529                         die("I/O error when reading standard input");
530
531                 if (want_trim)
532                         trim_fields();
533
534                 fields_reset(&out_fields);
535                 select_fields();
536
537                 out_format->write_line();
538                 if (ferror_unlocked(stdout))
539                         die("I/O error when writing standard input");
540         }
541 }
542
543 static void two_pass(void)
544 {
545         struct format *final_format = out_format;
546
547         // We need to use character set info from the current locale
548         setlocale(LC_CTYPE, "");
549
550         // Pass 1: Set up writer of intermediate format
551         out_format = xmalloc_zero(sizeof(*out_format));
552         out_format->id = FORM_TMP;
553         out_format->read_line = tmp_read;
554         out_format->write_line = tmp_write;
555         out_format->tmp_file = tmpfile();
556         intarray_init(&out_format->column_widths);
557         one_pass();
558
559         // Pass 2: Set up reader of intermediate format
560         in_format = out_format;
561         rewind(in_format->tmp_file);
562         out_format = final_format;
563         one_pass();
564         fclose(in_format->tmp_file);
565 }
566
567 /*** Parsing of arguments ***/
568
569 static void NONRET usage(void)
570 {
571         printf("\
572 Usage: xsv <in-format> [<out-format>] <options> [<fields>]\n\
573 \n\
574 Formats:\n\
575 -t, --tsv               TAB-separated values (default)\n\
576 -c, --csv               Comma-separated values\n\
577 -w, --ws                Values separated by arbitrary whitespace\n\
578 -W, --strict-ws         Like --ws, but recognize empty columns at start/end\n\
579 -r, --regex=<rx>        Separator given by Perl regular expression (input only)\n\
580     --table             Format a table (output only)\n\
581 \n\
582 Format parameters:\n\
583 -d, --fs=<char>         Delimiter of fields\n\
584 -q, --quiet             Do not show warnings\n\
585     --always-quote      Put quotes around all fields (CSV output only)\n\
586     --table-sep=<n>     Separate table columns by <n> spaces (default: 2)\n\
587 \n\
588 Other options:\n\
589     --trim              Trim leading and trailing whitespaces in fields\n\
590 ");
591         exit(0);
592 }
593
594 static void NONRET bad_args(const char *msg, ...)
595 {
596         if (msg) {
597                 va_list args;
598                 va_start(args, msg);
599                 fprintf(stderr, "xsv: ");
600                 vfprintf(stderr, msg, args);
601                 fputc('\n', stderr);
602                 va_end(args);
603         }
604         fprintf(stderr, "Try `xsv --help' for more information.\n");
605         exit(1);
606 }
607
// Short options as understood by getopt_long(); a colon marks options with an argument
static const char short_options[] = "cd:qr:twW";

// Codes of options which have no short form; they start above the range of characters
enum long_options {
	OPT_HELP = 256,
	OPT_TRIM,
	OPT_ALWAYS_QUOTE,
	OPT_TABLE,
	OPT_TABLE_SEP,
};

// Long options; those with a short form are mapped to its character
static const struct option long_options[] = {
	{ "always-quote",	no_argument,		NULL,	OPT_ALWAYS_QUOTE },
	{ "csv",		no_argument,		NULL,	'c' },
	{ "fs",			required_argument,	NULL,	'd' },
	{ "quiet",		no_argument,		NULL,	'q' },
	{ "regex",		required_argument,	NULL,	'r' },
	{ "strict-ws",		no_argument,		NULL,	'W' },
	{ "table",		no_argument,		NULL,	OPT_TABLE },
	{ "table-sep",		required_argument,	NULL,	OPT_TABLE_SEP },
	{ "trim",		no_argument,		NULL,	OPT_TRIM },
	{ "tsv",		no_argument,		NULL,	't' },
	{ "ws",			no_argument,		NULL,	'w' },
	{ "help",		no_argument,		NULL,	OPT_HELP },
	{ NULL,			0,			NULL,	0 },
};
633
634 static void set_format(int format_id)
635 {
636         struct format *f = xmalloc_zero(sizeof(*f));
637         f->id = format_id;
638
639         switch (format_id) {
640                 case FORM_TSV:
641                         f->fs = '\t';
642                         f->quote = -1;
643                         f->read_line = csv_read;
644                         f->write_line = csv_write;
645                         break;
646                 case FORM_CSV:
647                         f->fs = ',';
648                         f->quote = '"';
649                         f->read_line = csv_read;
650                         f->write_line = csv_write;
651                         break;
652                 case FORM_WS:
653                         f->fs = ' ';
654                         f->quote = -1;
655                         f->read_line = ws_read;
656                         f->write_line = csv_write;
657                         break;
658                 case FORM_REGEX:
659                         f->read_line = regex_read;
660                         break;
661                 case FORM_TABLE:
662                         f->write_line = table_write;
663                         f->needs_two_passes = 1;
664                         f->table_sep = 2;
665                         break;
666         }
667
668         if (!in_format)
669                 in_format = f;
670         else if (!out_format)
671                 out_format = f;
672         else
673                 bad_args("At most two formats may be given.");
674 }
675
676 static struct format *current_format(void)
677 {
678         if (out_format)
679                 return out_format;
680         if (in_format)
681                 return in_format;
682         set_format(FORM_TSV);
683         return in_format;
684 }
685
686 int main(int argc, char **argv)
687 {
688         int opt;
689         const char *err;
690
691         while ((opt = getopt_long(argc, argv, short_options, long_options, NULL)) >= 0)
692                 switch (opt) {
693                         case 'c':
694                                 set_format(FORM_CSV);
695                                 break;
696                         case 'd':
697                                 if (optarg[0])
698                                         current_format()->fs = optarg[0];
699                                 else
700                                         bad_args("No field delimiter given.");
701                                 break;
702                         case 'q':
703                                 current_format()->quiet = 1;
704                                 break;
705                         case 'r':
706                                 set_format(FORM_REGEX);
707                                 err = regex_set(current_format(), optarg);
708                                 if (err)
709                                         bad_args("Error compiling regex: %s", err);
710                                 break;
711                         case 't':
712                                 set_format(FORM_TSV);
713                                 break;
714                         case 'w':
715                                 set_format(FORM_WS);
716                                 break;
717                         case 'W':
718                                 set_format(FORM_WS);
719                                 current_format()->strict_ws = 1;
720                                 break;
721                         case OPT_ALWAYS_QUOTE:
722                                 if (current_format()->id != FORM_CSV)
723                                         bad_args("--always-quote makes sense only for CSV.");
724                                 current_format()->always_quote = 1;
725                                 break;
726                         case OPT_HELP:
727                                 usage();
728                         case OPT_TRIM:
729                                 want_trim = 1;
730                                 break;
731                         case OPT_TABLE:
732                                 set_format(FORM_TABLE);
733                                 break;
734                         case OPT_TABLE_SEP:
735                                 current_format()->table_sep = atoi(optarg);
736                                 break;
737                         default:
738                                 bad_args(NULL);
739                 }
740
741         current_format();
742         if (!out_format)
743                 out_format = in_format;
744         if (!in_format->read_line)
745                 bad_args("Write-only format selected for input.");
746         if (!out_format->write_line)
747                 bad_args("Read-only format selected for output.");
748
749         for (int i = optind; i < argc; i++) {
750                 err = parse_selector(argv[i]);
751                 if (err)
752                         bad_args(err);
753         }
754         finish_parse_selectors();
755
756         fields_init(&in_fields);
757         fields_init(&out_fields);
758         line_init(&in_line);
759
760         if (out_format->needs_two_passes)
761                 two_pass();
762         else
763                 one_pass();
764         return 0;
765 }