]> mj.ucw.cz Git - xsv.git/blob - xsv.c
5dfa80f15cad1903defda8213204fb6d3509af56
[xsv.git] / xsv.c
1 /*
2  *      A Swiss-Army Knife for CSV-like Files
3  *
4  *      (c) 2012 Martin Mares <mj@ucw.cz>
5  */
6
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#include <limits.h>
#include <getopt.h>
#include <wchar.h>
#include <locale.h>

#include <pcre.h>
16
17 /*** Memory allocation ***/
18
/*
 * Allocate `bytes` bytes of memory. Never returns NULL: when malloc() fails,
 * an error message is printed to stderr and the program exits with status 1.
 */
static void *xmalloc(size_t bytes)
{
        void *mem = malloc(bytes);
        if (mem)
                return mem;

        fprintf(stderr, "xsv: Out of memory (cannot allocate %zu bytes)\n", bytes);
        exit(1);
}
28
/*
 * Like xmalloc(), but the returned memory is filled with zero bytes.
 */
static void *xmalloc_zero(size_t bytes)
{
        // memset() hands back its first argument, i.e., the fresh block
        return memset(xmalloc(bytes), 0, bytes);
}
35
/*
 * Resize the block `old` (or allocate a new one if `old` is NULL) to `bytes`
 * bytes. Never returns NULL: when realloc() fails, an error message is
 * printed to stderr and the program exits with status 1.
 */
static void *xrealloc(void *old, size_t bytes)
{
        void *mem = realloc(old, bytes);
        if (mem)
                return mem;

        fprintf(stderr, "xsv: Out of memory (cannot allocate %zu bytes)\n", bytes);
        exit(1);
}
45
/*
 * DECLARE_BUF(name, type) defines a growable array of `type`:
 *
 *   name_t             the buffer (storage, number of used items, capacity)
 *   name_init(b)       make the buffer empty with no storage attached
 *   name_reset(b)      forget all items, but keep the allocated storage
 *   name_count(b)      number of items currently stored
 *   name_extend(b)     grow the capacity (16 items at first, doubled afterwards)
 *   name_push(b)       append one uninitialized item and return a pointer to it
 *   name_first(b)      pointer to the start of the storage (NULL right after init)
 *   name_nth(b, n)     pointer to the n-th item (counted from 0, not range-checked)
 *
 * Growing the storage goes through xrealloc(), so pointers obtained earlier
 * may become invalid after a push.
 */
#define DECLARE_BUF(name, type) \
        typedef struct { \
                type *start; \
                int count; \
                int max; \
        } name##_t; \
        \
        static inline void name##_init(name##_t *buf) \
        { \
                buf->start = NULL; \
                buf->count = 0; \
                buf->max = 0; \
        } \
        \
        static inline void name##_reset(name##_t *buf) \
        { \
                buf->count = 0; \
        } \
        \
        static inline int name##_count(name##_t *buf) \
        { \
                return buf->count; \
        } \
        \
        static void name##_extend(name##_t *buf) \
        { \
                if (buf->max) \
                        buf->max *= 2; \
                else \
                        buf->max = 16; \
                buf->start = xrealloc(buf->start, buf->max * sizeof(type)); \
        } \
        \
        static inline type *name##_push(name##_t *buf) \
        { \
                if (buf->count >= buf->max) \
                        name##_extend(buf); \
                type *slot = &buf->start[buf->count]; \
                buf->count++; \
                return slot; \
        } \
        \
        static inline type *name##_first(name##_t *buf) \
        { \
                return buf->start; \
        } \
        \
        static inline type *name##_nth(name##_t *buf, int n) \
        { \
                return &buf->start[n]; \
        }
62
63 DECLARE_BUF(intarray, int);
64
65 /*** Formats and their parameters ***/
66
/* Identifiers of all formats known to the program. */
enum format_id {
        FORM_UNSPEC = 0,        // No format
        FORM_TSV,               // TAB-separated values
        FORM_CSV,               // Comma-separated values
        FORM_WS,                // Values separated by whitespace
        FORM_REGEX,             // Separator given by a regular expression
        FORM_TMP,               // Intermediate temporary file of two-pass processing
        FORM_TABLE,             // Formatted table
};
76
77 struct format {
78         enum format_id id;
79         int fs;
80         int quote;
81         int quiet;
82         int (*read_line)(void);
83         void (*write_line)(void);
84         int needs_two_passes;
85
86         // CSV backend:
87         int always_quote;
88
89         // WS backend:
90         int strict_ws;
91
92         // regex backend:
93         pcre *pcre;
94         pcre_extra *pcre_extra;
95
96         // Temporary file backend:
97         FILE *tmp_file;
98         intarray_t column_widths;
99
100         // Table backend:
101         int table_sep;
102 };
103
// Formats currently used for reading and writing
static struct format *in_format;
static struct format *out_format;

// Non-zero if fields should be trimmed (--trim)
static int want_trim;
106
/* One field of a line, described as a slice of the in_line buffer. */
struct field {
        int start_pos;          // Offset of the first byte of the field within in_line
        int len;                // Length of the field in bytes
};
111
112 DECLARE_BUF(fields, struct field);
113 DECLARE_BUF(line, unsigned char);
114
115 static fields_t in_fields, out_fields;
116 static struct field *in_field;
117 static line_t in_line;
118 static int line_number;
119
120 static void new_field(int pos)
121 {
122         in_field = fields_push(&in_fields);
123         in_field->start_pos = pos;
124         in_field->len = 0;
125 }
126
127 static void ensure_field(int pos)
128 {
129         if (!in_field)
130                 new_field(pos);
131 }
132
133 static void warn(struct format *fmt, char *msg, ...)
134 {
135         if (!fmt->quiet) {
136                 fprintf(stderr, "Warning at line %d: ", line_number);
137                 va_list args;
138                 va_start(args, msg);
139                 vfprintf(stderr, msg, args);
140                 va_end(args);
141                 fputc('\n', stderr);
142         }
143 }
144
145 static int next_line(void)
146 {
147         for (;;) {
148                 int c = getchar();
149                 if (c == '\r')
150                         continue;
151                 if (c < 0)
152                         return !!line_count(&in_line);
153                 if (c == '\n')
154                         return 1;
155                 *line_push(&in_line) = c;
156         }
157 }
158
159 static int field_chars(struct field *f)
160 {
161         unsigned char *s = line_nth(&in_line, f->start_pos);
162         int i = 0;
163         mbstate_t mbs;
164         memset(&mbs, 0, sizeof(mbs));
165
166         int chars = 0;
167         while (i < f->len) {
168                 size_t k = mbrlen((char *) s + i, f->len - i, &mbs);
169                 if ((int) k <= 0)
170                         break;
171                 i += k;
172                 chars++;
173         }
174
175         return chars;
176 }
177
/*** CSV/TSV back-end ***/
179
180 static int csv_read(void)
181 {
182         int quoted = 0;
183         for (;;) {
184                 int c = getchar();
185                 int i = line_count(&in_line);
186 restart:
187                 if (c == '\r')
188                         continue;
189                 if (c < 0 || c == '\n') {
190                         if (quoted)
191                                 warn(in_format, "Missing closing quote.");
192                         if (c < 0)
193                                 return !!fields_count(&in_fields);
194                         else
195                                 return 1;
196                 }
197                 if (quoted) {
198                         if (c == in_format->quote) {
199                                 c = getchar();
200                                 if (c != in_format->quote) {
201                                         quoted = 0;
202                                         goto restart;
203                                 }
204                                 // Two quotes assimilate to one
205                         }
206                         // Fall through to pushing the character
207                 } else if (c == in_format->quote) {
208                         quoted = 1;
209                         continue;
210                 } else if (c == in_format->fs && !quoted) {
211                         ensure_field(i);
212                         new_field(i);
213                         continue;
214                 }
215                 ensure_field(i);
216                 *line_push(&in_line) = c;
217                 in_field->len++;
218         }
219 }
220
/* Return 1 if `c` is a space, a TAB or a form feed, 0 otherwise. */
static int is_ws(int c)
{
        switch (c) {
                case ' ':
                case '\t':
                case '\f':
                        return 1;
                default:
                        return 0;
        }
}
225
226 static void csv_write(void)
227 {
228         unsigned char *line = line_first(&in_line);
229         int n = fields_count(&out_fields);
230         for (int i=0; i<n; i++) {
231                 struct field *f = fields_nth(&out_fields, i);
232                 int need_quotes = 0;
233                 if (out_format->quote >= 0) {
234                         need_quotes = out_format->always_quote;
235                         for (int j=0; !need_quotes && j < f->len; j++) {
236                                 int c = line[f->start_pos + j];
237                                 if (c == out_format->fs || c == out_format->quote)
238                                         need_quotes = 1;
239                         }
240                 }
241                 if (i)
242                         putchar(out_format->fs);
243                 if (need_quotes)
244                         putchar(out_format->quote);
245                 for (int j=0; j < f->len; j++) {
246                         int c = line[f->start_pos + j];
247                         if (c == out_format->fs && !need_quotes)
248                                 warn(out_format, "Field separator found inside field and quoting is turned off.");
249                         if (c == out_format->quote)
250                                 putchar(c);
251                         putchar(c);
252                 }
253                 if (need_quotes)
254                         putchar(out_format->quote);
255         }
256         putchar('\n');
257 }
258
259 /*** White-space back-end ***/
260
261 static int ws_read(void)
262 {
263         if (!next_line())
264                 return 0;
265
266         unsigned char *line = line_first(&in_line);
267         int n = line_count(&in_line);
268         if (!n)
269                 return 1;
270
271         int ws = 0;
272         new_field(0);
273         for (int i=0; i<n; i++) {
274                 int c = line[i];
275                 if (is_ws(c)) {
276                         ws++;
277                 } else {
278                         if (ws) {
279                                 if (!in_field->start_pos &&
280                                     !in_field->len &&
281                                     !in_format->strict_ws)
282                                         in_field->start_pos = i;
283                                 else
284                                         new_field(i);
285                                 ws = 0;
286                         }
287                         in_field->len++;
288                 }
289         }
290
291         if (ws && in_format->strict_ws)
292                 new_field(n);
293         return 1;
294 }
295
296 /*** Regex back-end ***/
297
298 static const char *regex_set(struct format *f, char *rx)
299 {
300         const char *err;
301         int errpos;
302         f->pcre = pcre_compile(rx, PCRE_DOLLAR_ENDONLY, &err, &errpos, NULL);
303         if (!f->pcre)
304                 return err;
305
306         f->pcre_extra = pcre_study(f->pcre, 0, &err);
307         if (!f->pcre_extra)
308                 return err;
309
310         return NULL;
311 }
312
313 static int regex_read(void)
314 {
315         if (!next_line())
316                 return 0;
317
318         unsigned char *c = line_first(&in_line);
319         int n = line_count(&in_line);
320         if (!n)
321                 return 1;
322
323         int i = 0;
324         for (;;) {
325                 int ovec[3];
326                 int sep = pcre_exec(in_format->pcre, in_format->pcre_extra, (char *) c, n, i, 0, ovec, 3);
327                 if (sep < 0) {
328                         if (sep != PCRE_ERROR_NOMATCH)
329                                 warn(in_format, "PCRE matching error %d", sep);
330                         // No further occurrence of the separator: the rest is a single field
331                         new_field(i);
332                         in_field->len = n - i;
333                         return 1;
334                 }
335                 new_field(i);
336                 in_field->len = ovec[0] - i;
337                 i = ovec[1];
338         }
339 }
340
341 /*** Table back-end ***/
342
343 static void table_write(void)
344 {
345         for (int i = 0; i < fields_count(&in_fields); i++) {
346                 if (i)
347                         printf("%*s", out_format->table_sep, "");
348                 struct field *f = fields_nth(&in_fields, i);
349                 int fw = field_chars(f);
350                 int cw = *intarray_nth(&in_format->column_widths, i);
351                 if (fw > cw) {
352                         warn(out_format, "Internal error: Wrongly calculated column width (%d > %d)", fw, cw);
353                         cw = fw;
354                 }
355                 unsigned char *p = line_nth(&in_line, f->start_pos);
356                 for (int j = 0; j < f->len; j++)
357                         putchar(p[j]);
358                 while (fw < cw) {
359                         putchar(' ');
360                         fw++;
361                 }
362         }
363         putchar('\n');
364 }
365
366 /*** Temporary file back-end ***/
367
368 static int tmp_read(void)
369 {
370         FILE *tf = in_format->tmp_file;
371
372         for (;;) {
373                 int c = fgetc(tf);
374                 if (c < 0)
375                         return 0;
376                 if (c == 0xff)
377                         return 1;
378                 if (c == 0xfe) {
379                         c = fgetc(tf);
380                         c = (c << 8) | fgetc(tf);
381                         c = (c << 8) | fgetc(tf);
382                         c = (c << 8) | fgetc(tf);
383                 }
384                 new_field(line_count(&in_line));
385                 in_field->len = c;
386                 while (c--) {
387                         int x = fgetc(tf);
388                         if (x < 0) {
389                                 warn(in_format, "Truncated temporary file");
390                                 return 0;
391                         }
392                         *line_push(&in_line) = x;
393                 }
394         }
395 }
396
397 static void tmp_write(void)
398 {
399         FILE *tf = out_format->tmp_file;
400
401         for (int i = 0; i < fields_count(&in_fields); i++) {
402                 struct field *f = fields_nth(&in_fields, i);
403                 if (f->len < 0xfe)
404                         fputc(f->len, tf);
405                 else {
406                         fputc(0xfe, tf);
407                         fputc((f->len >> 24) & 0xff, tf);
408                         fputc((f->len >> 16) & 0xff, tf);
409                         fputc((f->len >> 8) & 0xff, tf);
410                         fputc(f->len & 0xff, tf);
411                 }
412
413                 unsigned char *p = line_nth(&in_line, f->start_pos);
414                 for (int j = 0; j < f->len; j++)
415                         fputc(*p++, tf);
416
417                 intarray_t *w = &out_format->column_widths;
418                 while (i >= intarray_count(w))
419                         *intarray_push(w) = 0;
420                 int fw = field_chars(f);
421                 if (*intarray_nth(w, i) < fw)
422                         *intarray_nth(w, i) = fw;
423         }
424         fputc(0xff, tf);
425 }
426
427 /*** Transforms ***/
428
429 static void trim_fields(void)
430 {
431         unsigned char *line = line_first(&in_line);
432         for (int i = 0; i < fields_count(&in_fields); i++) {
433                 struct field *f = fields_nth(&in_fields, i);
434                 while (f->len && is_ws(line[f->start_pos]))
435                         f->start_pos++, f->len--;
436                 while (f->len && is_ws(line[f->start_pos + f->len - 1]))
437                         f->len--;
438         }
439 }
440
441 /*** Field selection ***/
442
443 struct selector {
444         int first_field, last_field;
445 };
446
447 DECLARE_BUF(selectors, struct selector);
448 static selectors_t selectors;
449
450 static char *parse_selector(char *str)
451 {
452         char buf[strlen(str) + 1];
453         strcpy(buf, str);
454
455         struct selector *s = selectors_push(&selectors);
456         char *sep = strchr(buf, '-');
457         if (sep) {
458                 *sep++ = 0;
459                 s->first_field = atoi(buf);
460                 s->last_field = atoi(sep);
461         } else
462                 s->first_field = s->last_field = atoi(buf);
463
464         return NULL;
465 }
466
467 static void finish_parse_selectors(void)
468 {
469         if (!selectors_count(&selectors))
470                 parse_selector("-");
471 }
472
473 static void select_fields(void)
474 {
475         for (int i = 0; i < selectors_count(&selectors); i++) {
476                 struct selector *s = selectors_nth(&selectors, i);
477                 int first = s->first_field;
478                 if (first <= 0)
479                         first = 1;
480                 int last = s->last_field;
481                 if (last <= 0)
482                         last = fields_count(&in_fields);
483                 for (int j = first; j <= last; j++) {
484                         struct field *f = fields_push(&out_fields);
485                         if (j >= 1 && j <= fields_count(&in_fields))
486                                 *f = *fields_nth(&in_fields, j-1);
487                         else
488                                 f->start_pos = f->len = 0;
489                 }
490         }
491 }
492
493 /*** Processing of files ***/
494
495 static void one_pass(void)
496 {
497         line_number = 0;
498         for (;;) {
499                 line_number++;
500                 fields_reset(&in_fields);
501                 line_reset(&in_line);
502                 in_field = NULL;
503                 if (!in_format->read_line())
504                         break;
505
506                 if (want_trim)
507                         trim_fields();
508
509                 fields_reset(&out_fields);
510                 select_fields();
511
512                 out_format->write_line();
513         }
514 }
515
516 static void two_pass(void)
517 {
518         struct format *final_format = out_format;
519
520         // We need to use character set info from the current locale
521         setlocale(LC_CTYPE, "");
522
523         // Pass 1: Set up writer of intermediate format
524         out_format = xmalloc_zero(sizeof(*out_format));
525         out_format->id = FORM_TMP;
526         out_format->read_line = tmp_read;
527         out_format->write_line = tmp_write;
528         out_format->tmp_file = tmpfile();
529         intarray_init(&out_format->column_widths);
530         one_pass();
531
532         // Pass 2: Set up reader of intermediate format
533         in_format = out_format;
534         rewind(in_format->tmp_file);
535         out_format = final_format;
536         one_pass();
537         fclose(in_format->tmp_file);
538 }
539
540 /*** Parsing of arguments ***/
541
/* Print the help text to stdout and exit with status 0. */
static void usage(void)
{
        static const char help_text[] =
                "Usage: xsv <in-format> [<out-format>] <options> [<fields>]\n"
                "\n"
                "Formats:\n"
                "-t, --tsv               TAB-separated values (default)\n"
                "-c, --csv               Comma-separated values\n"
                "-w, --ws                Values separated by arbitrary whitespace\n"
                "-W, --strict-ws         Like --ws, but recognize empty columns at start/end\n"
                "-r, --regex=<rx>        Separator given by Perl regular expression (input only)\n"
                "    --table             Format a table (output only)\n"
                "\n"
                "Format parameters:\n"
                "-d, --fs=<char>         Delimiter of fields\n"
                "-q, --quiet             Do not show warnings\n"
                "    --always-quote      Put quotes around all fields (CSV output only)\n"
                "    --table-sep=<n>     Separate table columns by <n> spaces (default: 2)\n"
                "\n"
                "Other options:\n"
                "    --trim              Trim leading and trailing whitespaces in fields\n";

        fputs(help_text, stdout);
        exit(0);
}
566
/*
 * Complain about wrong arguments and exit with status 1. If `msg` is not
 * NULL, it is a printf-style message which is printed to stderr with the
 * "xsv: " prefix; a hint pointing to --help is printed in all cases.
 */
static void bad_args(const char *msg, ...)
{
        if (msg != NULL) {
                va_list ap;
                va_start(ap, msg);
                fputs("xsv: ", stderr);
                vfprintf(stderr, msg, ap);
                fputs("\n", stderr);
                va_end(ap);
        }

        fputs("Try `xsv --help' for more information.\n", stderr);
        exit(1);
}
580
// Short options for getopt_long(); a colon marks an option which takes an argument
static const char short_options[] = "c" "d:" "q" "r:" "t" "w" "W";
582
/*
 * Codes of options which have no short form. They start at 256, so they
 * never collide with the character codes of short options.
 */
enum long_options {
        OPT_HELP = 256,         // --help
        OPT_TRIM,               // --trim
        OPT_ALWAYS_QUOTE,       // --always-quote
        OPT_TABLE,              // --table
        OPT_TABLE_SEP,          // --table-sep
};
590
591 static const struct option long_options[] = {
592         { "always-quote",       0,      NULL,   OPT_ALWAYS_QUOTE },
593         { "csv",                0,      NULL,   'c' },
594         { "fs",                 1,      NULL,   'd' },
595         { "quiet",              0,      NULL,   'q' },
596         { "regex",              1,      NULL,   'r' },
597         { "strict-ws",          0,      NULL,   'W' },
598         { "table",              0,      NULL,   OPT_TABLE },
599         { "table-sep",          1,      NULL,   OPT_TABLE_SEP },
600         { "trim",               0,      NULL,   OPT_TRIM },
601         { "tsv",                0,      NULL,   't' },
602         { "ws",                 0,      NULL,   'w' },
603         { "help",               0,      NULL,   OPT_HELP },
604         { NULL,                 0,      NULL,   0 },
605 };
606
607 static void set_format(int format_id)
608 {
609         struct format *f = xmalloc_zero(sizeof(*f));
610         f->id = format_id;
611
612         switch (format_id) {
613                 case FORM_TSV:
614                         f->fs = '\t';
615                         f->quote = -1;
616                         f->read_line = csv_read;
617                         f->write_line = csv_write;
618                         break;
619                 case FORM_CSV:
620                         f->fs = ',';
621                         f->quote = '"';
622                         f->read_line = csv_read;
623                         f->write_line = csv_write;
624                         break;
625                 case FORM_WS:
626                         f->fs = ' ';
627                         f->quote = -1;
628                         f->read_line = ws_read;
629                         f->write_line = csv_write;
630                         break;
631                 case FORM_REGEX:
632                         f->read_line = regex_read;
633                         break;
634                 case FORM_TABLE:
635                         f->write_line = table_write;
636                         f->needs_two_passes = 1;
637                         f->table_sep = 2;
638                         break;
639         }
640
641         if (!in_format)
642                 in_format = f;
643         else if (!out_format)
644                 out_format = f;
645         else
646                 bad_args("At most two formats may be given.");
647 }
648
649 static struct format *current_format(void)
650 {
651         if (out_format)
652                 return out_format;
653         if (in_format)
654                 return in_format;
655         set_format(FORM_TSV);
656         return in_format;
657 }
658
659 int main(int argc, char **argv)
660 {
661         int opt;
662         const char *err;
663
664         while ((opt = getopt_long(argc, argv, short_options, long_options, NULL)) >= 0)
665                 switch (opt) {
666                         case 'c':
667                                 set_format(FORM_CSV);
668                                 break;
669                         case 'd':
670                                 if (optarg[0])
671                                         current_format()->fs = optarg[0];
672                                 else
673                                         bad_args("No field delimiter given.");
674                                 break;
675                         case 'q':
676                                 current_format()->quiet = 1;
677                                 break;
678                         case 'r':
679                                 set_format(FORM_REGEX);
680                                 err = regex_set(current_format(), optarg);
681                                 if (err)
682                                         bad_args("Error compiling regex: %s", err);
683                                 break;
684                         case 't':
685                                 set_format(FORM_TSV);
686                                 break;
687                         case 'w':
688                                 set_format(FORM_WS);
689                                 break;
690                         case 'W':
691                                 set_format(FORM_WS);
692                                 current_format()->strict_ws = 1;
693                                 break;
694                         case OPT_ALWAYS_QUOTE:
695                                 if (current_format()->id != FORM_CSV)
696                                         bad_args("--always-quote makes sense only for CSV.");
697                                 current_format()->always_quote = 1;
698                                 break;
699                         case OPT_HELP:
700                                 usage();
701                         case OPT_TRIM:
702                                 want_trim = 1;
703                                 break;
704                         case OPT_TABLE:
705                                 set_format(FORM_TABLE);
706                                 break;
707                         case OPT_TABLE_SEP:
708                                 current_format()->table_sep = atoi(optarg);
709                                 break;
710                         default:
711                                 bad_args(NULL);
712                 }
713
714         current_format();
715         if (!out_format)
716                 out_format = in_format;
717         if (!in_format->read_line)
718                 bad_args("Write-only format selected for input.");
719         if (!out_format->write_line)
720                 bad_args("Read-only format selected for output.");
721
722         for (int i = optind; i < argc; i++) {
723                 err = parse_selector(argv[i]);
724                 if (err)
725                         bad_args(err);
726         }
727         finish_parse_selectors();
728
729         fields_init(&in_fields);
730         fields_init(&out_fields);
731         line_init(&in_line);
732
733         if (out_format->needs_two_passes)
734                 two_pass();
735         else
736                 one_pass();
737         return 0;
738 }