]> mj.ucw.cz Git - xsv.git/blob - xsv.c
Added two-pass code and --table format
[xsv.git] / xsv.c
1 /*
2  *      A Swiss-Army Knife for CSV-like Files
3  *
4  *      (c) 2012 Martin Mares <mj@ucw.cz>
5  */
6
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#include <limits.h>
#include <getopt.h>

#include <pcre.h>
14
15 /*** Memory allocation ***/
16
/*
 * Allocate `bytes` bytes with malloc(). When the allocation fails, a message
 * is printed to stderr and the program exits with status 1, so callers never
 * see a NULL result.
 */
static void *xmalloc(size_t bytes)
{
	void *mem = malloc(bytes);

	if (mem)
		return mem;

	fprintf(stderr, "xsv: Out of memory (cannot allocate %zu bytes)\n", bytes);
	exit(1);
}
26
/*
 * Like xmalloc(), but the returned block is filled with zero bytes.
 */
static void *xmalloc_zero(size_t bytes)
{
	// memset() returns its first argument, i.e. the freshly allocated block
	return memset(xmalloc(bytes), 0, bytes);
}
33
/*
 * Resize the block `old` to `bytes` bytes with realloc(). When realloc()
 * returns NULL, a message is printed to stderr and the program exits with
 * status 1.
 */
static void *xrealloc(void *old, size_t bytes)
{
	void *mem = realloc(old, bytes);

	if (mem)
		return mem;

	fprintf(stderr, "xsv: Out of memory (cannot allocate %zu bytes)\n", bytes);
	exit(1);
}
43
/*
 * DECLARE_BUF(name, type) generates a growable array of `type` elements:
 *
 *	name_t			the array itself (storage, used count, capacity)
 *	name_init(b)		make the array empty with no storage
 *	name_reset(b)		forget all elements, keep the storage
 *	name_count(b)		number of elements currently stored
 *	name_extend(b)		grow the capacity (16 at first, doubled afterwards)
 *	name_push(b)		append one element and return a pointer to it;
 *				the new element is not initialized
 *	name_first(b)		pointer to the first element
 *	name_nth(b, n)		pointer to the n-th element (no bounds check)
 *
 * Storage is obtained from xrealloc(), so pointers returned earlier may become
 * invalid after a later name_push().
 */
#define DECLARE_BUF(name, type) \
	typedef struct { type *start; int count; int max; } name##_t; \
	\
	static inline void name##_init(name##_t *buf) \
	{ \
		buf->start = NULL; \
		buf->max = 0; \
		buf->count = 0; \
	} \
	\
	static inline void name##_reset(name##_t *buf) \
	{ \
		buf->count = 0; \
	} \
	\
	static inline int name##_count(name##_t *buf) \
	{ \
		return buf->count; \
	} \
	\
	static void name##_extend(name##_t *buf) \
	{ \
		if (buf->max) \
			buf->max *= 2; \
		else \
			buf->max = 16; \
		buf->start = xrealloc(buf->start, buf->max * sizeof(type)); \
	} \
	\
	static inline type *name##_push(name##_t *buf) \
	{ \
		if (buf->count >= buf->max) \
			name##_extend(buf); \
		type *slot = &buf->start[buf->count]; \
		buf->count++; \
		return slot; \
	} \
	\
	static inline type *name##_first(name##_t *buf) { return buf->start; } \
	static inline type *name##_nth(name##_t *buf, int n) { return &buf->start[n]; }

// Growable array of ints (used for column widths)
DECLARE_BUF(intarray, int);
62
63 /*** Formats and their parameters ***/
64
/* Identifiers of the known formats; the value is stored in struct format.id. */
enum format_id {
	FORM_UNSPEC = 0,	// Not set by any code in this file
	FORM_TSV,		// Selected by -t / --tsv
	FORM_CSV,		// Selected by -c / --csv
	FORM_WS,		// Selected by -w / --ws and -W / --strict-ws
	FORM_REGEX,		// Selected by -r / --regex
	FORM_TMP,		// Intermediate temporary file used by two_pass()
	FORM_TABLE,		// Selected by --table
};
74
75 struct format {
76         enum format_id id;
77         int fs;
78         int quote;
79         int quiet;
80         int (*read_line)(void);
81         void (*write_line)(void);
82         int needs_two_passes;
83
84         // CSV backend:
85         int always_quote;
86
87         // WS backend:
88         int strict_ws;
89
90         // regex backend:
91         pcre *pcre;
92         pcre_extra *pcre_extra;
93
94         // Temporary file backend:
95         FILE *tmp_file;
96         intarray_t column_widths;
97
98         // Table backend:
99         int table_sep;
100 };
101
102 static struct format *in_format, *out_format;
103 static int want_trim;
104
105 struct field {
106         int start_pos;
107         int len;
108 };
109
110 DECLARE_BUF(fields, struct field);
111 DECLARE_BUF(line, unsigned char);
112
113 static fields_t in_fields, out_fields;
114 static struct field *in_field;
115 static line_t in_line;
116 static int line_number;
117
118 static void new_field(int pos)
119 {
120         in_field = fields_push(&in_fields);
121         in_field->start_pos = pos;
122         in_field->len = 0;
123 }
124
125 static void ensure_field(int pos)
126 {
127         if (!in_field)
128                 new_field(pos);
129 }
130
131 static void warn(struct format *fmt, char *msg, ...)
132 {
133         if (!fmt->quiet) {
134                 fprintf(stderr, "Warning at line %d: ", line_number);
135                 va_list args;
136                 va_start(args, msg);
137                 vfprintf(stderr, args, msg);
138                 va_end(args);
139                 fputc('\n', stderr);
140         }
141 }
142
143 static int next_line(void)
144 {
145         for (;;) {
146                 int c = getchar();
147                 if (c == '\r')
148                         continue;
149                 if (c < 0)
150                         return !!line_count(&in_line);
151                 if (c == '\n')
152                         return 1;
153                 *line_push(&in_line) = c;
154         }
155 }
156
/*** CSV/TSV back-end ***/
158
159 static int csv_read(void)
160 {
161         int quoted = 0;
162         for (;;) {
163                 int c = getchar();
164                 int i = line_count(&in_line);
165 restart:
166                 if (c == '\r')
167                         continue;
168                 if (c < 0 || c == '\n') {
169                         if (quoted)
170                                 warn(in_format, "Missing closing quote.");
171                         if (c < 0)
172                                 return !!fields_count(&in_fields);
173                         else
174                                 return 1;
175                 }
176                 if (quoted) {
177                         if (c == in_format->quote) {
178                                 c = getchar();
179                                 if (c != in_format->quote) {
180                                         quoted = 0;
181                                         goto restart;
182                                 }
183                                 // Two quotes assimilate to one
184                         }
185                         // Fall through to pushing the character
186                 } else if (c == in_format->quote) {
187                         quoted = 1;
188                         continue;
189                 } else if (c == in_format->fs && !quoted) {
190                         ensure_field(i);
191                         new_field(i);
192                         continue;
193                 }
194                 ensure_field(i);
195                 *line_push(&in_line) = c;
196                 in_field->len++;
197         }
198 }
199
/*
 * Return 1 if `c` is a space, a tab or a form feed, 0 otherwise.
 */
static int is_ws(int c)
{
	switch (c) {
	case ' ':
	case '\t':
	case '\f':
		return 1;
	default:
		return 0;
	}
}
204
205 static void csv_write(void)
206 {
207         unsigned char *line = line_first(&in_line);
208         int n = fields_count(&out_fields);
209         for (int i=0; i<n; i++) {
210                 struct field *f = fields_nth(&out_fields, i);
211                 int need_quotes = 0;
212                 if (out_format->quote >= 0) {
213                         need_quotes = out_format->always_quote;
214                         for (int j=0; !need_quotes && j < f->len; j++) {
215                                 int c = line[f->start_pos + j];
216                                 if (c == out_format->fs || c == out_format->quote)
217                                         need_quotes = 1;
218                         }
219                 }
220                 if (i)
221                         putchar(out_format->fs);
222                 if (need_quotes)
223                         putchar(out_format->quote);
224                 for (int j=0; j < f->len; j++) {
225                         int c = line[f->start_pos + j];
226                         if (c == out_format->fs && !need_quotes)
227                                 warn(out_format, "Field separator found inside field and quoting is turned off.");
228                         if (c == out_format->quote)
229                                 putchar(c);
230                         putchar(c);
231                 }
232                 if (need_quotes)
233                         putchar(out_format->quote);
234         }
235         putchar('\n');
236 }
237
238 /*** White-space back-end ***/
239
240 static int ws_read(void)
241 {
242         if (!next_line())
243                 return 0;
244
245         unsigned char *line = line_first(&in_line);
246         int n = line_count(&in_line);
247         if (!n)
248                 return 1;
249
250         int ws = 0;
251         new_field(0);
252         for (int i=0; i<n; i++) {
253                 int c = line[i];
254                 if (is_ws(c)) {
255                         ws++;
256                 } else {
257                         if (ws) {
258                                 if (!in_field->start_pos &&
259                                     !in_field->len &&
260                                     !in_format->strict_ws)
261                                         in_field->start_pos = i;
262                                 else
263                                         new_field(i);
264                                 ws = 0;
265                         }
266                         in_field->len++;
267                 }
268         }
269
270         if (ws && in_format->strict_ws)
271                 new_field(n);
272         return 1;
273 }
274
275 /*** Regex back-end ***/
276
277 static const char *regex_set(struct format *f, char *rx)
278 {
279         const char *err;
280         int errpos;
281         f->pcre = pcre_compile(rx, PCRE_DOLLAR_ENDONLY, &err, &errpos, NULL);
282         if (!f->pcre)
283                 return err;
284
285         f->pcre_extra = pcre_study(f->pcre, 0, &err);
286         if (!f->pcre_extra)
287                 return err;
288
289         return NULL;
290 }
291
292 static int regex_read(void)
293 {
294         if (!next_line())
295                 return 0;
296
297         unsigned char *c = line_first(&in_line);
298         int n = line_count(&in_line);
299         if (!n)
300                 return 1;
301
302         int i = 0;
303         for (;;) {
304                 int ovec[3];
305                 int sep = pcre_exec(in_format->pcre, in_format->pcre_extra, (char *) c, n, i, 0, ovec, 3);
306                 if (sep < 0) {
307                         if (sep != PCRE_ERROR_NOMATCH)
308                                 warn(in_format, "PCRE matching error %d", sep);
309                         // No further occurrence of the separator: the rest is a single field
310                         new_field(i);
311                         in_field->len = n - i;
312                         return 1;
313                 }
314                 new_field(i);
315                 in_field->len = ovec[0] - i;
316                 i = ovec[1];
317         }
318 }
319
320 /*** Table back-end ***/
321
322 static void table_write(void)
323 {
324         for (int i = 0; i < fields_count(&in_fields); i++) {
325                 if (i)
326                         printf("%*s", out_format->table_sep, "");
327                 struct field *f = fields_nth(&in_fields, i);
328                 int w = *intarray_nth(&in_format->column_widths, i);
329                 if (f->len > w) {
330                         warn(out_format, "Internal error: Wrongly calculated column width (%d > %d)", f->len, w);
331                         w = f->len;
332                 }
333                 int j = 0;
334                 unsigned char *p = line_nth(&in_line, f->start_pos);
335                 while (j < f->len) {
336                         putchar(*p++);
337                         j++;
338                 }
339                 while (j < w) {
340                         putchar(' ');
341                         j++;
342                 }
343         }
344         putchar('\n');
345 }
346
347 /*** Temporary file back-end ***/
348
349 static int tmp_read(void)
350 {
351         FILE *tf = in_format->tmp_file;
352
353         for (;;) {
354                 int c = fgetc(tf);
355                 if (c < 0)
356                         return 0;
357                 if (c == 0xff)
358                         return 1;
359                 if (c == 0xfe) {
360                         c = fgetc(tf);
361                         c = (c << 8) | fgetc(tf);
362                         c = (c << 8) | fgetc(tf);
363                         c = (c << 8) | fgetc(tf);
364                 }
365                 new_field(line_count(&in_line));
366                 in_field->len = c;
367                 while (c--) {
368                         int x = fgetc(tf);
369                         if (x < 0) {
370                                 warn(in_format, "Truncated temporary file");
371                                 return 0;
372                         }
373                         *line_push(&in_line) = x;
374                 }
375         }
376 }
377
378 static void tmp_write(void)
379 {
380         FILE *tf = out_format->tmp_file;
381
382         for (int i = 0; i < fields_count(&in_fields); i++) {
383                 struct field *f = fields_nth(&in_fields, i);
384                 if (f->len < 0xfe)
385                         fputc(f->len, tf);
386                 else {
387                         fputc(0xfe, tf);
388                         fputc((f->len >> 24) & 0xff, tf);
389                         fputc((f->len >> 16) & 0xff, tf);
390                         fputc((f->len >> 8) & 0xff, tf);
391                         fputc(f->len & 0xff, tf);
392                 }
393
394                 unsigned char *p = line_nth(&in_line, f->start_pos);
395                 for (int j = 0; j < f->len; j++)
396                         fputc(*p++, tf);
397
398                 intarray_t *w = &out_format->column_widths;
399                 while (i >= intarray_count(w))
400                         *intarray_push(w) = 0;
401                 if (*intarray_nth(w, i) < f->len)
402                         *intarray_nth(w, i) = f->len;
403         }
404         fputc(0xff, tf);
405 }
406
407 /*** Transforms ***/
408
409 static void trim_fields(void)
410 {
411         unsigned char *line = line_first(&in_line);
412         for (int i = 0; i < fields_count(&in_fields); i++) {
413                 struct field *f = fields_nth(&in_fields, i);
414                 while (f->len && is_ws(line[f->start_pos]))
415                         f->start_pos++, f->len--;
416                 while (f->len && is_ws(line[f->start_pos + f->len - 1]))
417                         f->len--;
418         }
419 }
420
421 /*** Field selection ***/
422
423 struct selector {
424         int first_field, last_field;
425 };
426
427 DECLARE_BUF(selectors, struct selector);
428 static selectors_t selectors;
429
430 static char *parse_selector(char *str)
431 {
432         char buf[strlen(str) + 1];
433         strcpy(buf, str);
434
435         struct selector *s = selectors_push(&selectors);
436         char *sep = strchr(buf, '-');
437         if (sep) {
438                 *sep++ = 0;
439                 s->first_field = atoi(buf);
440                 s->last_field = atoi(sep);
441         } else
442                 s->first_field = s->last_field = atoi(buf);
443
444         return NULL;
445 }
446
447 static void finish_parse_selectors(void)
448 {
449         if (!selectors_count(&selectors))
450                 parse_selector("-");
451 }
452
453 static void select_fields(void)
454 {
455         for (int i = 0; i < selectors_count(&selectors); i++) {
456                 struct selector *s = selectors_nth(&selectors, i);
457                 int first = s->first_field;
458                 if (first <= 0)
459                         first = 1;
460                 int last = s->last_field;
461                 if (last <= 0)
462                         last = fields_count(&in_fields);
463                 for (int j = first; j <= last; j++) {
464                         struct field *f = fields_push(&out_fields);
465                         if (j >= 1 && j <= fields_count(&in_fields))
466                                 *f = *fields_nth(&in_fields, j-1);
467                         else
468                                 f->start_pos = f->len = 0;
469                 }
470         }
471 }
472
473 /*** Processing of files ***/
474
475 static void one_pass(void)
476 {
477         line_number = 0;
478         for (;;) {
479                 line_number++;
480                 fields_reset(&in_fields);
481                 line_reset(&in_line);
482                 in_field = NULL;
483                 if (!in_format->read_line())
484                         break;
485
486                 if (want_trim)
487                         trim_fields();
488
489                 fields_reset(&out_fields);
490                 select_fields();
491
492                 out_format->write_line();
493         }
494 }
495
496 static void two_pass(void)
497 {
498         struct format *final_format = out_format;
499
500         // Pass 1: Set up writer of intermediate format
501         out_format = xmalloc_zero(sizeof(*out_format));
502         out_format->id = FORM_TMP;
503         out_format->read_line = tmp_read;
504         out_format->write_line = tmp_write;
505         out_format->tmp_file = tmpfile();
506         intarray_init(&out_format->column_widths);
507         one_pass();
508
509         // Pass 2: Set up reader of intermediate format
510         in_format = out_format;
511         rewind(in_format->tmp_file);
512         out_format = final_format;
513         one_pass();
514         fclose(in_format->tmp_file);
515 }
516
517 /*** Parsing of arguments ***/
518
/*
 * Print the help text to stdout and exit with status 0.
 */
static void usage(void)
{
	static const char help[] =
		"Usage: xsv <in-format> [<out-format>] <options> [<fields>]\n"
		"\n"
		"Formats:\n"
		"-t, --tsv               TAB-separated values (default)\n"
		"-c, --csv               Comma-separated values\n"
		"-w, --ws                Values separated by arbitrary whitespace\n"
		"-W, --strict-ws         Like --ws, but recognize empty columns at start/end\n"
		"-r, --regex=<rx>        Separator given by Perl regular expression (input only)\n"
		"    --table             Format a table (output only)\n"
		"\n"
		"Format parameters:\n"
		"-d, --fs=<char>         Delimiter of fields\n"
		"-q, --quiet             Do not show warnings\n"
		"    --always-quote      Put quotes around all fields (CSV output only)\n"
		"    --table-sep=<n>     Separate table columns by <n> spaces (default: 2)\n"
		"\n"
		"Other options:\n"
		"    --trim              Trim leading and trailing whitespaces in fields\n";

	fputs(help, stdout);
	exit(0);
}
543
/*
 * Report a command-line error and exit with status 1.
 *
 * If `msg` is not NULL, it is a printf-style format which is printed to
 * stderr with the prefix "xsv: " and a trailing newline. A hint pointing
 * to --help is printed in either case.
 */
static void bad_args(const char *msg, ...)
{
	if (msg != NULL) {
		va_list ap;

		fputs("xsv: ", stderr);
		va_start(ap, msg);
		vfprintf(stderr, msg, ap);
		va_end(ap);
		fputc('\n', stderr);
	}

	fputs("Try `xsv --help' for more information.\n", stderr);
	exit(1);
}
557
// Short options in getopt() syntax; a colon marks an option taking an argument
static const char short_options[] = "cd:qr:twW";

/*
 * Codes returned by getopt_long() for options which have no short form.
 * They start at 256 to stay clear of all character codes.
 */
enum long_options {
	OPT_HELP = 256,
	OPT_TRIM,
	OPT_ALWAYS_QUOTE,
	OPT_TABLE,
	OPT_TABLE_SEP,
};

/*
 * Long options: name, whether an argument is expected, flag pointer (unused)
 * and the code returned by getopt_long(). Options with a short equivalent
 * return the corresponding character.
 */
static const struct option long_options[] = {
	{ "always-quote",	no_argument,		NULL,	OPT_ALWAYS_QUOTE },
	{ "csv",		no_argument,		NULL,	'c' },
	{ "fs",			required_argument,	NULL,	'd' },
	{ "quiet",		no_argument,		NULL,	'q' },
	{ "regex",		required_argument,	NULL,	'r' },
	{ "strict-ws",		no_argument,		NULL,	'W' },
	{ "table",		no_argument,		NULL,	OPT_TABLE },
	{ "table-sep",		required_argument,	NULL,	OPT_TABLE_SEP },
	{ "trim",		no_argument,		NULL,	OPT_TRIM },
	{ "tsv",		no_argument,		NULL,	't' },
	{ "ws",			no_argument,		NULL,	'w' },
	{ "help",		no_argument,		NULL,	OPT_HELP },
	{ NULL,			no_argument,		NULL,	0 },
};
583
584 static void set_format(int format_id)
585 {
586         struct format *f = xmalloc_zero(sizeof(*f));
587         f->id = format_id;
588
589         switch (format_id) {
590                 case FORM_TSV:
591                         f->fs = '\t';
592                         f->quote = -1;
593                         f->read_line = csv_read;
594                         f->write_line = csv_write;
595                         break;
596                 case FORM_CSV:
597                         f->fs = ',';
598                         f->quote = '"';
599                         f->read_line = csv_read;
600                         f->write_line = csv_write;
601                         break;
602                 case FORM_WS:
603                         f->fs = ' ';
604                         f->quote = -1;
605                         f->read_line = ws_read;
606                         f->write_line = csv_write;
607                         break;
608                 case FORM_REGEX:
609                         f->read_line = regex_read;
610                         break;
611                 case FORM_TABLE:
612                         f->write_line = table_write;
613                         f->needs_two_passes = 1;
614                         f->table_sep = 2;
615                         break;
616         }
617
618         if (!in_format)
619                 in_format = f;
620         else if (!out_format)
621                 out_format = f;
622         else
623                 bad_args("At most two formats may be given.");
624 }
625
626 static struct format *current_format(void)
627 {
628         if (out_format)
629                 return out_format;
630         if (in_format)
631                 return in_format;
632         set_format(FORM_TSV);
633         return in_format;
634 }
635
636 int main(int argc, char **argv)
637 {
638         int opt;
639         const char *err;
640
641         while ((opt = getopt_long(argc, argv, short_options, long_options, NULL)) >= 0)
642                 switch (opt) {
643                         case 'c':
644                                 set_format(FORM_CSV);
645                                 break;
646                         case 'd':
647                                 if (optarg[0])
648                                         current_format()->fs = optarg[0];
649                                 else
650                                         bad_args("No field delimiter given.");
651                                 break;
652                         case 'q':
653                                 current_format()->quiet = 1;
654                                 break;
655                         case 'r':
656                                 set_format(FORM_REGEX);
657                                 err = regex_set(current_format(), optarg);
658                                 if (err)
659                                         bad_args("Error compiling regex: %s", err);
660                                 break;
661                         case 't':
662                                 set_format(FORM_TSV);
663                                 break;
664                         case 'w':
665                                 set_format(FORM_WS);
666                                 break;
667                         case 'W':
668                                 set_format(FORM_WS);
669                                 current_format()->strict_ws = 1;
670                                 break;
671                         case OPT_ALWAYS_QUOTE:
672                                 if (current_format()->id != FORM_CSV)
673                                         bad_args("--always-quote makes sense only for CSV.");
674                                 current_format()->always_quote = 1;
675                                 break;
676                         case OPT_HELP:
677                                 usage();
678                         case OPT_TRIM:
679                                 want_trim = 1;
680                                 break;
681                         case OPT_TABLE:
682                                 set_format(FORM_TABLE);
683                                 break;
684                         case OPT_TABLE_SEP:
685                                 current_format()->table_sep = atoi(optarg);
686                                 break;
687                         default:
688                                 bad_args(NULL);
689                 }
690
691         current_format();
692         if (!out_format)
693                 out_format = in_format;
694         if (!in_format->read_line)
695                 bad_args("Write-only format selected for input.");
696         if (!out_format->write_line)
697                 bad_args("Read-only format selected for output.");
698
699         for (int i = optind; i < argc; i++) {
700                 err = parse_selector(argv[i]);
701                 if (err)
702                         bad_args(err);
703         }
704         finish_parse_selectors();
705
706         fields_init(&in_fields);
707         fields_init(&out_fields);
708         line_init(&in_line);
709
710         if (out_format->needs_two_passes)
711                 two_pass();
712         else
713                 one_pass();
714         return 0;
715 }