]> mj.ucw.cz Git - xsv.git/blob - xsv.c
4ebe79844cf98cd2820dece4f54007b859fcda9e
[xsv.git] / xsv.c
1 /*
2  *      A Swiss-Army Knife for CSV-like Files
3  *
4  *      (c) 2012 Martin Mares <mj@ucw.cz>
5  */
6
7 #define _GNU_SOURCE
8
9 #include <stdio.h>
10 #include <stdlib.h>
11 #include <string.h>
12 #include <stdarg.h>
13 #include <getopt.h>
14 #include <wchar.h>
15 #include <locale.h>
16
17 #include <pcre.h>
18
19 #ifdef __GNUC__
20 #define NONRET __attribute__((noreturn))
21 #else
22 #define NONRET
23 #endif
24
25 /*** General functions ***/
26
27 static void NONRET die(char *msg, ...)
28 {
29         va_list args;
30         va_start(args, msg);
31         fprintf(stderr, "xsv: ");
32         vfprintf(stderr, msg, args);
33         fputc('\n', stderr);
34         va_end(args);
35         exit(1);
36 }
37
38 /*** Memory allocation ***/
39
/*
 * malloc() wrapper which never returns NULL: when the allocation fails,
 * the program is terminated via die().
 */
static void *xmalloc(size_t bytes)
{
        void *mem = malloc(bytes);

        if (mem == NULL)
                die("Out of memory (cannot allocate %zu bytes)", bytes);
        return mem;
}
47
/* Like xmalloc(), but the returned block is filled with zero bytes. */
static void *xmalloc_zero(size_t bytes)
{
        // memset() hands back its first argument, i.e., the fresh block
        return memset(xmalloc(bytes), 0, bytes);
}
54
/*
 * realloc() wrapper which never returns NULL: when the block cannot be
 * resized to the requested number of bytes, the program is terminated
 * via die().
 */
static void *xrealloc(void *old, size_t bytes)
{
        void *mem = realloc(old, bytes);

        if (mem == NULL)
                die("Out of memory (cannot allocate %zu bytes)", bytes);
        return mem;
}
62
/*
 * DECLARE_BUF(name, type) generates a simple growing array of `type':
 *
 *      name_t                  the buffer itself (start pointer, number of used
 *                              items, number of allocated items)
 *      name_init(b)            make the buffer empty with nothing allocated
 *      name_reset(b)           forget all items, but keep the allocated memory
 *      name_count(b)           number of items currently stored
 *      name_extend(b)          grow the allocation (16 items at first, then doubling)
 *      name_push(b)            append one item and return a pointer to it;
 *                              the new item is not initialized
 *      name_first(b)           pointer to the first item (NULL if nothing was allocated yet)
 *      name_nth(b, n)          pointer to the n-th item (counted from 0, no range check)
 *
 * Since name_push() may reallocate the array, pointers obtained from
 * name_push(), name_first() and name_nth() must not be kept across a push.
 * Counts are plain ints and the size computation is not checked for overflow.
 */
#define DECLARE_BUF(name, type) \
        typedef struct { type *start; int count; int max; } name##_t;                           \
        static inline void name##_init(name##_t *b) { b->start = NULL; b->count = b->max = 0; } \
        static inline void name##_reset(name##_t *b) { b->count = 0; }                          \
        static inline int name##_count(name##_t *b) { return b->count; }                        \
        static void name##_extend(name##_t *b) {                                                \
                b->max = b->max ? 2*b->max : 16;                                                \
                b->start = xrealloc(b->start, b->max * sizeof(type));                           \
        }                                                                                       \
        static inline type *name##_push(name##_t *b) {                                          \
                if (b->count >= b->max) name##_extend(b);                                       \
                return &b->start[b->count++];                                                   \
        }                                                                                       \
        static inline type *name##_first(name##_t *b) { return b->start; }                      \
        static inline type *name##_nth(name##_t *b, int n) { return &b->start[n]; }             \
        // end

// Growing array of ints (used for the table of column widths)
DECLARE_BUF(intarray, int);
81
82 /*** Formats and their parameters ***/
83
// Identifiers of the known formats; stored in struct format.id by set_format() and two_pass()
enum format_id {
        FORM_UNSPEC,            // Value 0; not assigned anywhere in this file
        FORM_TSV,               // Fields separated by TAB, no quoting
        FORM_CSV,               // Fields separated by comma, quoted by double quotes
        FORM_WS,                // Fields separated by runs of whitespace
        FORM_REGEX,             // Separator given by a regular expression (no writer)
        FORM_TMP,               // Intermediate temporary file used by two_pass()
        FORM_TABLE,             // Aligned table (no reader)
};
93
/*
 * Description of one input or output format: the back-end functions
 * together with their parameters. Instances are created zeroed by
 * set_format() and two_pass().
 */
struct format {
        enum format_id id;
        int fs;                         // Field separator character
        int quote;                      // Quoting character, or -1 if quoting is disabled
        int quiet;                      // Non-zero: warn() prints nothing for this format
        int (*read_line)(struct format *fmt);   // Reads one line into in_line/in_fields; returns 0 when there is nothing more to read; NULL if the format cannot be read
        void (*write_line)(struct format *fmt); // Writes one line; NULL if the format cannot be written
        int needs_stats;                // Non-zero: output requires column widths, so two passes are needed

        // CSV backend:
        int always_quote;               // Quote all fields on output, not only those which need it

        // WS backend:
        int strict_ws;                  // Recognize empty fields at the start and end of line

        // regex backend:
        pcre *pcre;                     // Compiled separator regex (see regex_set())
        pcre_extra *pcre_extra;         // Result of pcre_study(), may be NULL

        // Temporary file backend:
        FILE *tmp_file;

        // Table backend:
        int table_sep;                  // Number of spaces between table columns
};
119
// Formats selected for input and output; NULL until chosen by set_format()
static struct format *in_format, *out_format;
// Non-zero if --trim was given
static int want_trim;

// A field is a slice of the current line buffer (in_line)
struct field {
        int start_pos;          // Offset of the first byte within in_line
        int len;                // Length in bytes
};

DECLARE_BUF(fields, struct field);
DECLARE_BUF(line, unsigned char);

// Fields of the current input line as produced by the reader, and fields selected for output
static fields_t in_fields, out_fields;
// Field which was added most recently by new_field(); NULL if the current line has no field yet
static struct field *in_field;
// Bytes of all fields of the current line
static line_t in_line;
// Number of the line being processed (counted from 1 within each pass), shown in warnings
static int line_number;
135
136 static void new_field(int pos)
137 {
138         in_field = fields_push(&in_fields);
139         in_field->start_pos = pos;
140         in_field->len = 0;
141 }
142
143 static void ensure_field(int pos)
144 {
145         if (!in_field)
146                 new_field(pos);
147 }
148
149 static void warn(struct format *fmt, char *msg, ...)
150 {
151         if (!fmt->quiet) {
152                 fprintf(stderr, "Warning at line %d: ", line_number);
153                 va_list args;
154                 va_start(args, msg);
155                 vfprintf(stderr, msg, args);
156                 va_end(args);
157                 fputc('\n', stderr);
158         }
159 }
160
161 static int next_line(void)
162 {
163         for (;;) {
164                 int c = getchar_unlocked();
165                 if (c == '\r')
166                         continue;
167                 if (c < 0)
168                         return !!line_count(&in_line);
169                 if (c == '\n')
170                         return 1;
171                 *line_push(&in_line) = c;
172         }
173 }
174
175 static int field_chars(struct field *f)
176 {
177         unsigned char *s = line_nth(&in_line, f->start_pos);
178         int i = 0;
179         mbstate_t mbs;
180         memset(&mbs, 0, sizeof(mbs));
181
182         int chars = 0;
183         while (i < f->len) {
184                 size_t k = mbrlen((char *) s + i, f->len - i, &mbs);
185                 if ((int) k <= 0)
186                         break;
187                 i += k;
188                 chars++;
189         }
190
191         return chars;
192 }
193
194 /*** Field statistics ***/
195
196 static intarray_t column_widths;
197
198 static void update_stats(void)
199 {
200         for (int i = 0; i < fields_count(&in_fields); i++) {
201                 struct field *f = fields_nth(&in_fields, i);
202                 intarray_t *w = &column_widths;
203
204                 while (i >= intarray_count(w))
205                         *intarray_push(w) = 0;
206                 int fw = field_chars(f);
207                 if (*intarray_nth(w, i) < fw)
208                         *intarray_nth(w, i) = fw;
209         }
210 }
211
/*** CSV/TSV back-end ***/
213
/*
 * Reader for CSV-like formats (also used for TSV, where fmt->quote is -1
 * and therefore never matches any character).
 *
 * Reads one record from standard input character by character, stores
 * the bytes of its fields in in_line and their positions in in_fields.
 * Carriage returns are dropped everywhere. A newline always ends the
 * record, even inside quotes (a warning is issued in such a case).
 *
 * Returns 1 if a record was read (an empty line is a record with no
 * fields), 0 at the end of input when no field has been started.
 */
static int csv_read(struct format *fmt)
{
        int quoted = 0;         // Non-zero while we are between an opening and a closing quote
        for (;;) {
                int c = getchar_unlocked();
                // Offset in in_line where the next stored byte will land
                int i = line_count(&in_line);
restart:
                // We get here also with a character read ahead after a quote;
                // nothing was pushed in between, so `i' is still valid.
                if (c == '\r')
                        continue;
                if (c < 0 || c == '\n') {
                        if (quoted)
                                warn(fmt, "Missing closing quote.");
                        if (c < 0)
                                return !!fields_count(&in_fields);
                        else
                                return 1;
                }
                if (quoted) {
                        if (c == fmt->quote) {
                                c = getchar_unlocked();
                                if (c != fmt->quote) {
                                        // It was the closing quote; process the following character normally
                                        quoted = 0;
                                        goto restart;
                                }
                                // Two quotes assimilate to one
                        }
                        // Fall through to pushing the character
                } else if (c == fmt->quote) {
                        quoted = 1;
                        continue;
                } else if (c == fmt->fs && !quoted) {
                        // Make sure the field before the separator exists (it could be empty),
                        // then open the field after the separator.
                        ensure_field(i);
                        new_field(i);
                        continue;
                }
                ensure_field(i);
                *line_push(&in_line) = c;
                in_field->len++;
        }
}
254
/* Returns 1 if `c' is a space, a TAB or a form feed, 0 otherwise. */
static int is_ws(int c)
{
        switch (c) {
        case ' ':
        case '\t':
        case '\f':
                return 1;
        default:
                return 0;
        }
}
259
260 static void csv_write(struct format *fmt)
261 {
262         unsigned char *line = line_first(&in_line);
263         int n = fields_count(&out_fields);
264         for (int i=0; i<n; i++) {
265                 struct field *f = fields_nth(&out_fields, i);
266                 int need_quotes = 0;
267                 if (fmt->quote >= 0) {
268                         need_quotes = fmt->always_quote;
269                         for (int j=0; !need_quotes && j < f->len; j++) {
270                                 int c = line[f->start_pos + j];
271                                 if (c == fmt->fs || c == fmt->quote)
272                                         need_quotes = 1;
273                         }
274                 }
275                 if (i)
276                         putchar_unlocked(fmt->fs);
277                 if (need_quotes)
278                         putchar_unlocked(fmt->quote);
279                 for (int j=0; j < f->len; j++) {
280                         int c = line[f->start_pos + j];
281                         if (c == fmt->fs && !need_quotes)
282                                 warn(fmt, "Field separator found inside field and quoting is turned off.");
283                         if (c == fmt->quote)
284                                 putchar_unlocked(c);
285                         putchar_unlocked(c);
286                 }
287                 if (need_quotes)
288                         putchar_unlocked(fmt->quote);
289         }
290         putchar_unlocked('\n');
291 }
292
293 /*** White-space back-end ***/
294
295 static int ws_read(struct format *fmt)
296 {
297         if (!next_line())
298                 return 0;
299
300         unsigned char *line = line_first(&in_line);
301         int n = line_count(&in_line);
302         if (!n)
303                 return 1;
304
305         int ws = 0;
306         new_field(0);
307         for (int i=0; i<n; i++) {
308                 int c = line[i];
309                 if (is_ws(c)) {
310                         ws++;
311                 } else {
312                         if (ws) {
313                                 if (!in_field->start_pos &&
314                                     !in_field->len &&
315                                     !fmt->strict_ws)
316                                         in_field->start_pos = i;
317                                 else
318                                         new_field(i);
319                                 ws = 0;
320                         }
321                         in_field->len++;
322                 }
323         }
324
325         if (ws && fmt->strict_ws)
326                 new_field(n);
327         return 1;
328 }
329
330 /*** Regex back-end ***/
331
332 static const char *regex_set(struct format *f, char *rx)
333 {
334         const char *err;
335         int errpos;
336         f->pcre = pcre_compile(rx, PCRE_DOLLAR_ENDONLY, &err, &errpos, NULL);
337         if (!f->pcre)
338                 return err;
339
340         f->pcre_extra = pcre_study(f->pcre, 0, &err);
341         if (!f->pcre_extra)
342                 return err;
343
344         return NULL;
345 }
346
347 static int regex_read(struct format *fmt)
348 {
349         if (!next_line())
350                 return 0;
351
352         unsigned char *c = line_first(&in_line);
353         int n = line_count(&in_line);
354         if (!n)
355                 return 1;
356
357         int i = 0;
358         for (;;) {
359                 int ovec[3];
360                 int sep = pcre_exec(fmt->pcre, fmt->pcre_extra, (char *) c, n, i, 0, ovec, 3);
361                 if (sep < 0) {
362                         if (sep != PCRE_ERROR_NOMATCH)
363                                 warn(fmt, "PCRE matching error %d", sep);
364                         // No further occurrence of the separator: the rest is a single field
365                         new_field(i);
366                         in_field->len = n - i;
367                         return 1;
368                 }
369                 new_field(i);
370                 in_field->len = ovec[0] - i;
371                 i = ovec[1];
372         }
373 }
374
375 /*** Table back-end ***/
376
377 static void table_write(struct format *fmt)
378 {
379         for (int i = 0; i < fields_count(&in_fields); i++) {
380                 if (i)
381                         printf("%*s", fmt->table_sep, "");
382                 struct field *f = fields_nth(&in_fields, i);
383                 int fw = field_chars(f);
384                 int cw = *intarray_nth(&column_widths, i);
385                 if (fw > cw) {
386                         warn(fmt, "Internal error: Wrongly calculated column width (%d > %d)", fw, cw);
387                         cw = fw;
388                 }
389                 unsigned char *p = line_nth(&in_line, f->start_pos);
390                 for (int j = 0; j < f->len; j++)
391                         putchar_unlocked(p[j]);
392                 while (fw < cw) {
393                         putchar_unlocked(' ');
394                         fw++;
395                 }
396         }
397         putchar_unlocked('\n');
398 }
399
400 /*** Temporary file back-end ***/
401
402 static int tmp_read(struct format *fmt)
403 {
404         FILE *tf = fmt->tmp_file;
405
406         for (;;) {
407                 int c = getc_unlocked(tf);
408                 if (c < 0)
409                         return 0;
410                 if (c == 0xff)
411                         return 1;
412                 if (c == 0xfe) {
413                         c = getc_unlocked(tf);
414                         c = (c << 8) | getc_unlocked(tf);
415                         c = (c << 8) | getc_unlocked(tf);
416                         c = (c << 8) | getc_unlocked(tf);
417                 }
418                 new_field(line_count(&in_line));
419                 in_field->len = c;
420                 while (c--) {
421                         int x = getc_unlocked(tf);
422                         if (x < 0) {
423                                 warn(fmt, "Truncated temporary file");
424                                 return 0;
425                         }
426                         *line_push(&in_line) = x;
427                 }
428         }
429
430         if (ferror_unlocked(tf))
431                 die("I/O error when reading temporary file");
432 }
433
434 static void tmp_write(struct format *fmt)
435 {
436         FILE *tf = fmt->tmp_file;
437
438         for (int i = 0; i < fields_count(&in_fields); i++) {
439                 struct field *f = fields_nth(&in_fields, i);
440                 if (f->len < 0xfe)
441                         putc_unlocked(f->len, tf);
442                 else {
443                         putc_unlocked(0xfe, tf);
444                         putc_unlocked((f->len >> 24) & 0xff, tf);
445                         putc_unlocked((f->len >> 16) & 0xff, tf);
446                         putc_unlocked((f->len >> 8) & 0xff, tf);
447                         putc_unlocked(f->len & 0xff, tf);
448                 }
449
450                 unsigned char *p = line_nth(&in_line, f->start_pos);
451                 for (int j = 0; j < f->len; j++)
452                         putc_unlocked(*p++, tf);
453         }
454         putc_unlocked(0xff, tf);
455
456         if (ferror_unlocked(tf))
457                 die("I/O error when writing temporary file");
458 }
459
460 /*** Transforms ***/
461
462 static void trim_fields(void)
463 {
464         unsigned char *line = line_first(&in_line);
465         for (int i = 0; i < fields_count(&in_fields); i++) {
466                 struct field *f = fields_nth(&in_fields, i);
467                 while (f->len && is_ws(line[f->start_pos]))
468                         f->start_pos++, f->len--;
469                 while (f->len && is_ws(line[f->start_pos + f->len - 1]))
470                         f->len--;
471         }
472 }
473
474 /*** Field selection ***/
475
/*
 * A selector picks a range of input fields, numbered from 1.
 * A bound which is zero or negative is open: select_fields() replaces
 * it by the first or the last field of the line, respectively.
 */
struct selector {
        int first_field, last_field;
};

DECLARE_BUF(selectors, struct selector);
// All selectors in the order in which they were parsed
static selectors_t selectors;
482
483 static char *parse_selector(char *str)
484 {
485         char buf[strlen(str) + 1];
486         strcpy(buf, str);
487
488         struct selector *s = selectors_push(&selectors);
489         char *sep = strchr(buf, '-');
490         if (sep) {
491                 *sep++ = 0;
492                 s->first_field = atoi(buf);
493                 s->last_field = atoi(sep);
494         } else
495                 s->first_field = s->last_field = atoi(buf);
496
497         return NULL;
498 }
499
500 static void finish_parse_selectors(void)
501 {
502         if (!selectors_count(&selectors))
503                 parse_selector("-");
504 }
505
506 static void select_fields(void)
507 {
508         for (int i = 0; i < selectors_count(&selectors); i++) {
509                 struct selector *s = selectors_nth(&selectors, i);
510                 int first = s->first_field;
511                 if (first <= 0)
512                         first = 1;
513                 int last = s->last_field;
514                 if (last <= 0)
515                         last = fields_count(&in_fields);
516                 for (int j = first; j <= last; j++) {
517                         struct field *f = fields_push(&out_fields);
518                         if (j >= 1 && j <= fields_count(&in_fields))
519                                 *f = *fields_nth(&in_fields, j-1);
520                         else
521                                 f->start_pos = f->len = 0;
522                 }
523         }
524 }
525
526 /*** Processing of files ***/
527
528 static void one_pass(void)
529 {
530         line_number = 0;
531         for (;;) {
532                 line_number++;
533                 fields_reset(&in_fields);
534                 line_reset(&in_line);
535                 in_field = NULL;
536                 if (!in_format->read_line(in_format))
537                         break;
538                 if (ferror_unlocked(stdin))
539                         die("I/O error when reading standard input");
540
541                 if (want_trim)
542                         trim_fields();
543
544                 fields_reset(&out_fields);
545                 select_fields();
546
547                 if (out_format->needs_stats)
548                         update_stats();
549                 out_format->write_line(out_format);
550                 if (ferror_unlocked(stdout))
551                         die("I/O error when writing standard input");
552         }
553 }
554
555 static void two_pass(void)
556 {
557         struct format *final_format = out_format;
558
559         // We need to use character set info from the current locale
560         setlocale(LC_CTYPE, "");
561
562         // Pass 1: Set up writer of intermediate format
563         out_format = xmalloc_zero(sizeof(*out_format));
564         out_format->id = FORM_TMP;
565         out_format->read_line = tmp_read;
566         out_format->write_line = tmp_write;
567         out_format->tmp_file = tmpfile();
568         out_format->needs_stats = final_format->needs_stats;
569         one_pass();
570
571         // Pass 2: Set up reader of intermediate format
572         in_format = out_format;
573         rewind(in_format->tmp_file);
574         out_format = final_format;
575         out_format->needs_stats = 0;
576         one_pass();
577         fclose(in_format->tmp_file);
578 }
579
580 /*** Parsing of arguments ***/
581
582 static void NONRET usage(void)
583 {
584         printf("\
585 Usage: xsv <in-format> [<out-format>] <options> [<fields>]\n\
586 \n\
587 Formats:\n\
588 -t, --tsv               TAB-separated values (default)\n\
589 -c, --csv               Comma-separated values\n\
590 -w, --ws                Values separated by arbitrary whitespace\n\
591 -W, --strict-ws         Like --ws, but recognize empty columns at start/end\n\
592 -r, --regex=<rx>        Separator given by Perl regular expression (input only)\n\
593     --table             Format a table (output only)\n\
594 \n\
595 Format parameters:\n\
596 -d, --fs=<char>         Delimiter of fields\n\
597 -q, --quiet             Do not show warnings\n\
598     --always-quote      Put quotes around all fields (CSV output only)\n\
599     --table-sep=<n>     Separate table columns by <n> spaces (default: 2)\n\
600 \n\
601 Other options:\n\
602     --trim              Trim leading and trailing whitespaces in fields\n\
603 ");
604         exit(0);
605 }
606
607 static void NONRET bad_args(const char *msg, ...)
608 {
609         if (msg) {
610                 va_list args;
611                 va_start(args, msg);
612                 fprintf(stderr, "xsv: ");
613                 vfprintf(stderr, msg, args);
614                 fputc('\n', stderr);
615                 va_end(args);
616         }
617         fprintf(stderr, "Try `xsv --help' for more information.\n");
618         exit(1);
619 }
620
// Short options in getopt() syntax; a colon marks options which take an argument
static const char short_options[] = "cd:qr:twW";

// Codes of options which have only a long form; they start at 256 to stay clear of character codes
enum long_options {
        OPT_HELP = 256,
        OPT_TRIM,
        OPT_ALWAYS_QUOTE,
        OPT_TABLE,
        OPT_TABLE_SEP,
};

// Long options for getopt_long(): name, has argument, flag pointer (unused), value returned
static const struct option long_options[] = {
        { "always-quote",       0,      NULL,   OPT_ALWAYS_QUOTE },
        { "csv",                0,      NULL,   'c' },
        { "fs",                 1,      NULL,   'd' },
        { "quiet",              0,      NULL,   'q' },
        { "regex",              1,      NULL,   'r' },
        { "strict-ws",          0,      NULL,   'W' },
        { "table",              0,      NULL,   OPT_TABLE },
        { "table-sep",          1,      NULL,   OPT_TABLE_SEP },
        { "trim",               0,      NULL,   OPT_TRIM },
        { "tsv",                0,      NULL,   't' },
        { "ws",                 0,      NULL,   'w' },
        { "help",               0,      NULL,   OPT_HELP },
        { NULL,                 0,      NULL,   0 },
};
646
647 static void set_format(int format_id)
648 {
649         struct format *f = xmalloc_zero(sizeof(*f));
650         f->id = format_id;
651
652         switch (format_id) {
653                 case FORM_TSV:
654                         f->fs = '\t';
655                         f->quote = -1;
656                         f->read_line = csv_read;
657                         f->write_line = csv_write;
658                         break;
659                 case FORM_CSV:
660                         f->fs = ',';
661                         f->quote = '"';
662                         f->read_line = csv_read;
663                         f->write_line = csv_write;
664                         break;
665                 case FORM_WS:
666                         f->fs = ' ';
667                         f->quote = -1;
668                         f->read_line = ws_read;
669                         f->write_line = csv_write;
670                         break;
671                 case FORM_REGEX:
672                         f->read_line = regex_read;
673                         break;
674                 case FORM_TABLE:
675                         f->write_line = table_write;
676                         f->needs_stats = 1;
677                         f->table_sep = 2;
678                         break;
679         }
680
681         if (!in_format)
682                 in_format = f;
683         else if (!out_format)
684                 out_format = f;
685         else
686                 bad_args("At most two formats may be given.");
687 }
688
689 static struct format *current_format(void)
690 {
691         if (out_format)
692                 return out_format;
693         if (in_format)
694                 return in_format;
695         set_format(FORM_TSV);
696         return in_format;
697 }
698
699 int main(int argc, char **argv)
700 {
701         int opt;
702         const char *err;
703
704         while ((opt = getopt_long(argc, argv, short_options, long_options, NULL)) >= 0)
705                 switch (opt) {
706                         case 'c':
707                                 set_format(FORM_CSV);
708                                 break;
709                         case 'd':
710                                 if (optarg[0])
711                                         current_format()->fs = optarg[0];
712                                 else
713                                         bad_args("No field delimiter given.");
714                                 break;
715                         case 'q':
716                                 current_format()->quiet = 1;
717                                 break;
718                         case 'r':
719                                 set_format(FORM_REGEX);
720                                 err = regex_set(current_format(), optarg);
721                                 if (err)
722                                         bad_args("Error compiling regex: %s", err);
723                                 break;
724                         case 't':
725                                 set_format(FORM_TSV);
726                                 break;
727                         case 'w':
728                                 set_format(FORM_WS);
729                                 break;
730                         case 'W':
731                                 set_format(FORM_WS);
732                                 current_format()->strict_ws = 1;
733                                 break;
734                         case OPT_ALWAYS_QUOTE:
735                                 if (current_format()->id != FORM_CSV)
736                                         bad_args("--always-quote makes sense only for CSV.");
737                                 current_format()->always_quote = 1;
738                                 break;
739                         case OPT_HELP:
740                                 usage();
741                         case OPT_TRIM:
742                                 want_trim = 1;
743                                 break;
744                         case OPT_TABLE:
745                                 set_format(FORM_TABLE);
746                                 break;
747                         case OPT_TABLE_SEP:
748                                 current_format()->table_sep = atoi(optarg);
749                                 break;
750                         default:
751                                 bad_args(NULL);
752                 }
753
754         current_format();
755         if (!out_format)
756                 out_format = in_format;
757         if (!in_format->read_line)
758                 bad_args("Write-only format selected for input.");
759         if (!out_format->write_line)
760                 bad_args("Read-only format selected for output.");
761
762         for (int i = optind; i < argc; i++) {
763                 err = parse_selector(argv[i]);
764                 if (err)
765                         bad_args(err);
766         }
767         finish_parse_selectors();
768
769         if (out_format->needs_stats)
770                 two_pass();
771         else
772                 one_pass();
773         return 0;
774 }