]> mj.ucw.cz Git - xsv.git/blob - xsv.c
Use unlocked stdio, it is faster
[xsv.git] / xsv.c
1 /*
2  *      A Swiss-Army Knife for CSV-like Files
3  *
4  *      (c) 2012 Martin Mares <mj@ucw.cz>
5  */
6
7 #define _GNU_SOURCE
8
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#include <limits.h>
#include <getopt.h>
#include <wchar.h>
#include <locale.h>

#include <pcre.h>
18
19 /*** Memory allocation ***/
20
/*
 * Allocate `bytes` bytes with malloc(). This function never returns NULL:
 * when the allocation fails, it reports the problem on stderr and terminates
 * the program with exit status 1.
 */
static void *xmalloc(size_t bytes)
{
	void *mem = malloc(bytes);
	if (mem)
		return mem;

	fprintf(stderr, "xsv: Out of memory (cannot allocate %zu bytes)\n", bytes);
	exit(1);
}
30
/*
 * Like xmalloc(), but the returned memory is filled with zero bytes.
 */
static void *xmalloc_zero(size_t bytes)
{
	// memset() returns its first argument, i.e., the freshly allocated block
	return memset(xmalloc(bytes), 0, bytes);
}
37
/*
 * Resize the block `old` to `bytes` bytes with realloc() (`old` may be NULL).
 * This function never returns NULL: when the reallocation fails, it reports
 * the problem on stderr and terminates the program with exit status 1.
 */
static void *xrealloc(void *old, size_t bytes)
{
	void *mem = realloc(old, bytes);
	if (mem)
		return mem;

	fprintf(stderr, "xsv: Out of memory (cannot allocate %zu bytes)\n", bytes);
	exit(1);
}
47
/*
 * DECLARE_BUF(name, type) generates a growable array of `type`:
 *
 *	name_t			the buffer itself (start pointer, number of used items, capacity)
 *	name_init(b)		make the buffer empty with no memory allocated
 *	name_reset(b)		forget all items, but keep the allocated memory
 *	name_count(b)		number of items currently stored
 *	name_push(b)		append one uninitialized item and return a pointer to it
 *	name_first(b)		pointer to the first item (NULL if nothing was ever pushed)
 *	name_nth(b, n)		pointer to the n-th item (n counted from 0, not range-checked)
 *
 * The capacity starts at 16 items and doubles whenever the buffer is full.
 * Pointers returned by push/first/nth are invalidated by the next push, because
 * the storage may be moved by realloc. If doubling would overflow either the
 * `int` capacity or the byte size passed to xrealloc(), the program reports
 * the problem and exits instead of invoking undefined behaviour.
 *
 * Only block comments may be used inside the macro body: after line splicing,
 * a line comment would swallow the rest of the definition.
 */
#define DECLARE_BUF(name, type) \
	typedef struct { type *start; int count; int max; } name##_t;				\
	static inline void name##_init(name##_t *b) { b->start = NULL; b->count = b->max = 0; }	\
	static inline void name##_reset(name##_t *b) { b->count = 0; }				\
	static inline int name##_count(name##_t *b) { return b->count; }			\
	static void name##_extend(name##_t *b) {						\
		/* Refuse to double when 2*max or 2*max*sizeof(type) would overflow */		\
		if (b->max > INT_MAX / 2 ||							\
		    (size_t) b->max > (size_t) -1 / sizeof(type) / 2) {				\
			fprintf(stderr, "xsv: Out of memory (buffer would exceed the maximum size)\n"); \
			exit(1);								\
		}										\
		b->max = b->max ? 2*b->max : 16;						\
		b->start = xrealloc(b->start, b->max * sizeof(type));				\
	}											\
	static inline type *name##_push(name##_t *b) {						\
		if (b->count >= b->max) name##_extend(b);					\
		return &b->start[b->count++];							\
	}											\
	static inline type *name##_first(name##_t *b) { return b->start; }			\
	static inline type *name##_nth(name##_t *b, int n) { return &b->start[n]; }		\
	// end

DECLARE_BUF(intarray, int);
66
67 /*** Formats and their parameters ***/
68
/* Identifiers of the known formats; stored in the `id` member of struct format. */
enum format_id {
	FORM_UNSPEC,		// Value 0; never assigned explicitly in this file
	FORM_TSV,		// TAB-separated values (-t, also the default format)
	FORM_CSV,		// Comma-separated values with quoting (-c)
	FORM_WS,		// Values separated by white space (-w, -W)
	FORM_REGEX,		// Separator given by a regular expression (-r), input only
	FORM_TMP,		// Internal temporary-file format used between the two passes
	FORM_TABLE,		// Aligned table (--table), output only
};
78
/*
 * One input or output format together with its parameters. Instances are
 * allocated zero-filled (see set_format() and two_pass()), so every member
 * which is not set explicitly starts as 0 or NULL.
 */
struct format {
	enum format_id id;
	int fs;				// Field separator character
	int quote;			// Quote character; -1 for TSV and WS, where quoting is disabled
	int quiet;			// Non-zero: warn() stays silent for this format
	int (*read_line)(void);		// Reads one line into in_line/in_fields, returns 0 when there is no more input; NULL for output-only formats
	void (*write_line)(void);	// Writes the current line; NULL for input-only formats
	int needs_two_passes;		// Non-zero: main() runs two_pass() instead of one_pass() when this is the output format

	// CSV backend:
	int always_quote;		// Non-zero: csv_write() quotes every field

	// WS backend:
	int strict_ws;			// Non-zero: white space at the start/end of a line delimits an empty field

	// regex backend:
	pcre *pcre;			// Compiled separator pattern (see regex_set())
	pcre_extra *pcre_extra;		// Result of pcre_study() on the pattern, may be NULL

	// Temporary file backend:
	FILE *tmp_file;
	intarray_t column_widths;	// For each column, the maximum field width in characters seen by tmp_write()

	// Table backend:
	int table_sep;			// Number of spaces printed between table columns
};
105
// Formats in use; filled in by set_format() in the order they appear on the command line
static struct format *in_format, *out_format;
static int want_trim;			// Set by --trim: one_pass() calls trim_fields() on every line

// A field is a range of bytes inside in_line
struct field {
	int start_pos;			// Index of the first byte of the field in in_line
	int len;			// Length of the field in bytes
};

DECLARE_BUF(fields, struct field);
DECLARE_BUF(line, unsigned char);

static fields_t in_fields, out_fields;	// Fields produced by the reader / fields chosen by select_fields()
static struct field *in_field;		// Field most recently started by new_field(); NULL at the start of each line
static line_t in_line;			// Bytes of the current line; all fields point into it
static int line_number;			// Number of the current line (from 1), printed by warn()
121
122 static void new_field(int pos)
123 {
124         in_field = fields_push(&in_fields);
125         in_field->start_pos = pos;
126         in_field->len = 0;
127 }
128
129 static void ensure_field(int pos)
130 {
131         if (!in_field)
132                 new_field(pos);
133 }
134
135 static void warn(struct format *fmt, char *msg, ...)
136 {
137         if (!fmt->quiet) {
138                 fprintf(stderr, "Warning at line %d: ", line_number);
139                 va_list args;
140                 va_start(args, msg);
141                 vfprintf(stderr, msg, args);
142                 va_end(args);
143                 fputc('\n', stderr);
144         }
145 }
146
147 static int next_line(void)
148 {
149         for (;;) {
150                 int c = getchar_unlocked();
151                 if (c == '\r')
152                         continue;
153                 if (c < 0)
154                         return !!line_count(&in_line);
155                 if (c == '\n')
156                         return 1;
157                 *line_push(&in_line) = c;
158         }
159 }
160
161 static int field_chars(struct field *f)
162 {
163         unsigned char *s = line_nth(&in_line, f->start_pos);
164         int i = 0;
165         mbstate_t mbs;
166         memset(&mbs, 0, sizeof(mbs));
167
168         int chars = 0;
169         while (i < f->len) {
170                 size_t k = mbrlen((char *) s + i, f->len - i, &mbs);
171                 if ((int) k <= 0)
172                         break;
173                 i += k;
174                 chars++;
175         }
176
177         return chars;
178 }
179
/*** CSV/TSV back-end ***/
181
182 static int csv_read(void)
183 {
184         int quoted = 0;
185         for (;;) {
186                 int c = getchar_unlocked();
187                 int i = line_count(&in_line);
188 restart:
189                 if (c == '\r')
190                         continue;
191                 if (c < 0 || c == '\n') {
192                         if (quoted)
193                                 warn(in_format, "Missing closing quote.");
194                         if (c < 0)
195                                 return !!fields_count(&in_fields);
196                         else
197                                 return 1;
198                 }
199                 if (quoted) {
200                         if (c == in_format->quote) {
201                                 c = getchar_unlocked();
202                                 if (c != in_format->quote) {
203                                         quoted = 0;
204                                         goto restart;
205                                 }
206                                 // Two quotes assimilate to one
207                         }
208                         // Fall through to pushing the character
209                 } else if (c == in_format->quote) {
210                         quoted = 1;
211                         continue;
212                 } else if (c == in_format->fs && !quoted) {
213                         ensure_field(i);
214                         new_field(i);
215                         continue;
216                 }
217                 ensure_field(i);
218                 *line_push(&in_line) = c;
219                 in_field->len++;
220         }
221 }
222
/*
 * Return 1 if `c` is a space, a TAB or a form feed, 0 otherwise.
 */
static int is_ws(int c)
{
	switch (c) {
		case ' ':
		case '\t':
		case '\f':
			return 1;
		default:
			return 0;
	}
}
227
228 static void csv_write(void)
229 {
230         unsigned char *line = line_first(&in_line);
231         int n = fields_count(&out_fields);
232         for (int i=0; i<n; i++) {
233                 struct field *f = fields_nth(&out_fields, i);
234                 int need_quotes = 0;
235                 if (out_format->quote >= 0) {
236                         need_quotes = out_format->always_quote;
237                         for (int j=0; !need_quotes && j < f->len; j++) {
238                                 int c = line[f->start_pos + j];
239                                 if (c == out_format->fs || c == out_format->quote)
240                                         need_quotes = 1;
241                         }
242                 }
243                 if (i)
244                         putchar_unlocked(out_format->fs);
245                 if (need_quotes)
246                         putchar_unlocked(out_format->quote);
247                 for (int j=0; j < f->len; j++) {
248                         int c = line[f->start_pos + j];
249                         if (c == out_format->fs && !need_quotes)
250                                 warn(out_format, "Field separator found inside field and quoting is turned off.");
251                         if (c == out_format->quote)
252                                 putchar_unlocked(c);
253                         putchar_unlocked(c);
254                 }
255                 if (need_quotes)
256                         putchar_unlocked(out_format->quote);
257         }
258         putchar_unlocked('\n');
259 }
260
261 /*** White-space back-end ***/
262
263 static int ws_read(void)
264 {
265         if (!next_line())
266                 return 0;
267
268         unsigned char *line = line_first(&in_line);
269         int n = line_count(&in_line);
270         if (!n)
271                 return 1;
272
273         int ws = 0;
274         new_field(0);
275         for (int i=0; i<n; i++) {
276                 int c = line[i];
277                 if (is_ws(c)) {
278                         ws++;
279                 } else {
280                         if (ws) {
281                                 if (!in_field->start_pos &&
282                                     !in_field->len &&
283                                     !in_format->strict_ws)
284                                         in_field->start_pos = i;
285                                 else
286                                         new_field(i);
287                                 ws = 0;
288                         }
289                         in_field->len++;
290                 }
291         }
292
293         if (ws && in_format->strict_ws)
294                 new_field(n);
295         return 1;
296 }
297
298 /*** Regex back-end ***/
299
300 static const char *regex_set(struct format *f, char *rx)
301 {
302         const char *err;
303         int errpos;
304         f->pcre = pcre_compile(rx, PCRE_DOLLAR_ENDONLY, &err, &errpos, NULL);
305         if (!f->pcre)
306                 return err;
307
308         f->pcre_extra = pcre_study(f->pcre, 0, &err);
309         if (!f->pcre_extra)
310                 return err;
311
312         return NULL;
313 }
314
315 static int regex_read(void)
316 {
317         if (!next_line())
318                 return 0;
319
320         unsigned char *c = line_first(&in_line);
321         int n = line_count(&in_line);
322         if (!n)
323                 return 1;
324
325         int i = 0;
326         for (;;) {
327                 int ovec[3];
328                 int sep = pcre_exec(in_format->pcre, in_format->pcre_extra, (char *) c, n, i, 0, ovec, 3);
329                 if (sep < 0) {
330                         if (sep != PCRE_ERROR_NOMATCH)
331                                 warn(in_format, "PCRE matching error %d", sep);
332                         // No further occurrence of the separator: the rest is a single field
333                         new_field(i);
334                         in_field->len = n - i;
335                         return 1;
336                 }
337                 new_field(i);
338                 in_field->len = ovec[0] - i;
339                 i = ovec[1];
340         }
341 }
342
343 /*** Table back-end ***/
344
345 static void table_write(void)
346 {
347         for (int i = 0; i < fields_count(&in_fields); i++) {
348                 if (i)
349                         printf("%*s", out_format->table_sep, "");
350                 struct field *f = fields_nth(&in_fields, i);
351                 int fw = field_chars(f);
352                 int cw = *intarray_nth(&in_format->column_widths, i);
353                 if (fw > cw) {
354                         warn(out_format, "Internal error: Wrongly calculated column width (%d > %d)", fw, cw);
355                         cw = fw;
356                 }
357                 unsigned char *p = line_nth(&in_line, f->start_pos);
358                 for (int j = 0; j < f->len; j++)
359                         putchar_unlocked(p[j]);
360                 while (fw < cw) {
361                         putchar_unlocked(' ');
362                         fw++;
363                 }
364         }
365         putchar_unlocked('\n');
366 }
367
368 /*** Temporary file back-end ***/
369
370 static int tmp_read(void)
371 {
372         FILE *tf = in_format->tmp_file;
373
374         for (;;) {
375                 int c = getc_unlocked(tf);
376                 if (c < 0)
377                         return 0;
378                 if (c == 0xff)
379                         return 1;
380                 if (c == 0xfe) {
381                         c = getc_unlocked(tf);
382                         c = (c << 8) | getc_unlocked(tf);
383                         c = (c << 8) | getc_unlocked(tf);
384                         c = (c << 8) | getc_unlocked(tf);
385                 }
386                 new_field(line_count(&in_line));
387                 in_field->len = c;
388                 while (c--) {
389                         int x = getc_unlocked(tf);
390                         if (x < 0) {
391                                 warn(in_format, "Truncated temporary file");
392                                 return 0;
393                         }
394                         *line_push(&in_line) = x;
395                 }
396         }
397 }
398
399 static void tmp_write(void)
400 {
401         FILE *tf = out_format->tmp_file;
402
403         for (int i = 0; i < fields_count(&in_fields); i++) {
404                 struct field *f = fields_nth(&in_fields, i);
405                 if (f->len < 0xfe)
406                         putc_unlocked(f->len, tf);
407                 else {
408                         putc_unlocked(0xfe, tf);
409                         putc_unlocked((f->len >> 24) & 0xff, tf);
410                         putc_unlocked((f->len >> 16) & 0xff, tf);
411                         putc_unlocked((f->len >> 8) & 0xff, tf);
412                         putc_unlocked(f->len & 0xff, tf);
413                 }
414
415                 unsigned char *p = line_nth(&in_line, f->start_pos);
416                 for (int j = 0; j < f->len; j++)
417                         putc_unlocked(*p++, tf);
418
419                 intarray_t *w = &out_format->column_widths;
420                 while (i >= intarray_count(w))
421                         *intarray_push(w) = 0;
422                 int fw = field_chars(f);
423                 if (*intarray_nth(w, i) < fw)
424                         *intarray_nth(w, i) = fw;
425         }
426         putc_unlocked(0xff, tf);
427 }
428
429 /*** Transforms ***/
430
431 static void trim_fields(void)
432 {
433         unsigned char *line = line_first(&in_line);
434         for (int i = 0; i < fields_count(&in_fields); i++) {
435                 struct field *f = fields_nth(&in_fields, i);
436                 while (f->len && is_ws(line[f->start_pos]))
437                         f->start_pos++, f->len--;
438                 while (f->len && is_ws(line[f->start_pos + f->len - 1]))
439                         f->len--;
440         }
441 }
442
443 /*** Field selection ***/
444
/*
 * A selector picks a range of input fields, numbered from 1. A bound which
 * is zero or negative is open: select_fields() replaces such a first_field
 * by 1 and such a last_field by the number of fields on the current line.
 */
struct selector {
	int first_field, last_field;
};

DECLARE_BUF(selectors, struct selector);
static selectors_t selectors;		// All selectors in the order given on the command line
451
452 static char *parse_selector(char *str)
453 {
454         char buf[strlen(str) + 1];
455         strcpy(buf, str);
456
457         struct selector *s = selectors_push(&selectors);
458         char *sep = strchr(buf, '-');
459         if (sep) {
460                 *sep++ = 0;
461                 s->first_field = atoi(buf);
462                 s->last_field = atoi(sep);
463         } else
464                 s->first_field = s->last_field = atoi(buf);
465
466         return NULL;
467 }
468
469 static void finish_parse_selectors(void)
470 {
471         if (!selectors_count(&selectors))
472                 parse_selector("-");
473 }
474
475 static void select_fields(void)
476 {
477         for (int i = 0; i < selectors_count(&selectors); i++) {
478                 struct selector *s = selectors_nth(&selectors, i);
479                 int first = s->first_field;
480                 if (first <= 0)
481                         first = 1;
482                 int last = s->last_field;
483                 if (last <= 0)
484                         last = fields_count(&in_fields);
485                 for (int j = first; j <= last; j++) {
486                         struct field *f = fields_push(&out_fields);
487                         if (j >= 1 && j <= fields_count(&in_fields))
488                                 *f = *fields_nth(&in_fields, j-1);
489                         else
490                                 f->start_pos = f->len = 0;
491                 }
492         }
493 }
494
495 /*** Processing of files ***/
496
497 static void one_pass(void)
498 {
499         line_number = 0;
500         for (;;) {
501                 line_number++;
502                 fields_reset(&in_fields);
503                 line_reset(&in_line);
504                 in_field = NULL;
505                 if (!in_format->read_line())
506                         break;
507
508                 if (want_trim)
509                         trim_fields();
510
511                 fields_reset(&out_fields);
512                 select_fields();
513
514                 out_format->write_line();
515         }
516 }
517
518 static void two_pass(void)
519 {
520         struct format *final_format = out_format;
521
522         // We need to use character set info from the current locale
523         setlocale(LC_CTYPE, "");
524
525         // Pass 1: Set up writer of intermediate format
526         out_format = xmalloc_zero(sizeof(*out_format));
527         out_format->id = FORM_TMP;
528         out_format->read_line = tmp_read;
529         out_format->write_line = tmp_write;
530         out_format->tmp_file = tmpfile();
531         intarray_init(&out_format->column_widths);
532         one_pass();
533
534         // Pass 2: Set up reader of intermediate format
535         in_format = out_format;
536         rewind(in_format->tmp_file);
537         out_format = final_format;
538         one_pass();
539         fclose(in_format->tmp_file);
540 }
541
542 /*** Parsing of arguments ***/
543
/*
 * Print the help text to stdout and exit with status 0.
 */
static void usage(void)
{
	static const char help_text[] =
		"Usage: xsv <in-format> [<out-format>] <options> [<fields>]\n"
		"\n"
		"Formats:\n"
		"-t, --tsv               TAB-separated values (default)\n"
		"-c, --csv               Comma-separated values\n"
		"-w, --ws                Values separated by arbitrary whitespace\n"
		"-W, --strict-ws         Like --ws, but recognize empty columns at start/end\n"
		"-r, --regex=<rx>        Separator given by Perl regular expression (input only)\n"
		"    --table             Format a table (output only)\n"
		"\n"
		"Format parameters:\n"
		"-d, --fs=<char>         Delimiter of fields\n"
		"-q, --quiet             Do not show warnings\n"
		"    --always-quote      Put quotes around all fields (CSV output only)\n"
		"    --table-sep=<n>     Separate table columns by <n> spaces (default: 2)\n"
		"\n"
		"Other options:\n"
		"    --trim              Trim leading and trailing whitespaces in fields\n";

	fputs(help_text, stdout);
	exit(0);
}
568
/*
 * Report an error in the command-line arguments and exit with status 1.
 * If `msg` is not NULL, it is a printf-style format of a message which is
 * printed to stderr with the prefix "xsv: ". A hint pointing to --help is
 * printed in all cases.
 */
static void bad_args(const char *msg, ...)
{
	if (msg != NULL) {
		va_list ap;

		fputs("xsv: ", stderr);
		va_start(ap, msg);
		vfprintf(stderr, msg, ap);
		va_end(ap);
		fputc('\n', stderr);
	}

	fputs("Try `xsv --help' for more information.\n", stderr);
	exit(1);
}
582
// Short options for getopt_long(); a colon marks an option with an argument
static const char short_options[] = "cd:qr:twW";

// Codes of options which have only a long form; they start above the range of characters
enum long_options {
	OPT_HELP = 256,
	OPT_TRIM,
	OPT_ALWAYS_QUOTE,
	OPT_TABLE,
	OPT_TABLE_SEP,
};

// Long options for getopt_long(); those with a short form report its character
static const struct option long_options[] = {
	{ "always-quote",	no_argument,		NULL,	OPT_ALWAYS_QUOTE },
	{ "csv",		no_argument,		NULL,	'c' },
	{ "fs",			required_argument,	NULL,	'd' },
	{ "quiet",		no_argument,		NULL,	'q' },
	{ "regex",		required_argument,	NULL,	'r' },
	{ "strict-ws",		no_argument,		NULL,	'W' },
	{ "table",		no_argument,		NULL,	OPT_TABLE },
	{ "table-sep",		required_argument,	NULL,	OPT_TABLE_SEP },
	{ "trim",		no_argument,		NULL,	OPT_TRIM },
	{ "tsv",		no_argument,		NULL,	't' },
	{ "ws",			no_argument,		NULL,	'w' },
	{ "help",		no_argument,		NULL,	OPT_HELP },
	{ NULL,			no_argument,		NULL,	0 },
};
608
609 static void set_format(int format_id)
610 {
611         struct format *f = xmalloc_zero(sizeof(*f));
612         f->id = format_id;
613
614         switch (format_id) {
615                 case FORM_TSV:
616                         f->fs = '\t';
617                         f->quote = -1;
618                         f->read_line = csv_read;
619                         f->write_line = csv_write;
620                         break;
621                 case FORM_CSV:
622                         f->fs = ',';
623                         f->quote = '"';
624                         f->read_line = csv_read;
625                         f->write_line = csv_write;
626                         break;
627                 case FORM_WS:
628                         f->fs = ' ';
629                         f->quote = -1;
630                         f->read_line = ws_read;
631                         f->write_line = csv_write;
632                         break;
633                 case FORM_REGEX:
634                         f->read_line = regex_read;
635                         break;
636                 case FORM_TABLE:
637                         f->write_line = table_write;
638                         f->needs_two_passes = 1;
639                         f->table_sep = 2;
640                         break;
641         }
642
643         if (!in_format)
644                 in_format = f;
645         else if (!out_format)
646                 out_format = f;
647         else
648                 bad_args("At most two formats may be given.");
649 }
650
651 static struct format *current_format(void)
652 {
653         if (out_format)
654                 return out_format;
655         if (in_format)
656                 return in_format;
657         set_format(FORM_TSV);
658         return in_format;
659 }
660
661 int main(int argc, char **argv)
662 {
663         int opt;
664         const char *err;
665
666         while ((opt = getopt_long(argc, argv, short_options, long_options, NULL)) >= 0)
667                 switch (opt) {
668                         case 'c':
669                                 set_format(FORM_CSV);
670                                 break;
671                         case 'd':
672                                 if (optarg[0])
673                                         current_format()->fs = optarg[0];
674                                 else
675                                         bad_args("No field delimiter given.");
676                                 break;
677                         case 'q':
678                                 current_format()->quiet = 1;
679                                 break;
680                         case 'r':
681                                 set_format(FORM_REGEX);
682                                 err = regex_set(current_format(), optarg);
683                                 if (err)
684                                         bad_args("Error compiling regex: %s", err);
685                                 break;
686                         case 't':
687                                 set_format(FORM_TSV);
688                                 break;
689                         case 'w':
690                                 set_format(FORM_WS);
691                                 break;
692                         case 'W':
693                                 set_format(FORM_WS);
694                                 current_format()->strict_ws = 1;
695                                 break;
696                         case OPT_ALWAYS_QUOTE:
697                                 if (current_format()->id != FORM_CSV)
698                                         bad_args("--always-quote makes sense only for CSV.");
699                                 current_format()->always_quote = 1;
700                                 break;
701                         case OPT_HELP:
702                                 usage();
703                         case OPT_TRIM:
704                                 want_trim = 1;
705                                 break;
706                         case OPT_TABLE:
707                                 set_format(FORM_TABLE);
708                                 break;
709                         case OPT_TABLE_SEP:
710                                 current_format()->table_sep = atoi(optarg);
711                                 break;
712                         default:
713                                 bad_args(NULL);
714                 }
715
716         current_format();
717         if (!out_format)
718                 out_format = in_format;
719         if (!in_format->read_line)
720                 bad_args("Write-only format selected for input.");
721         if (!out_format->write_line)
722                 bad_args("Read-only format selected for output.");
723
724         for (int i = optind; i < argc; i++) {
725                 err = parse_selector(argv[i]);
726                 if (err)
727                         bad_args(err);
728         }
729         finish_parse_selectors();
730
731         fields_init(&in_fields);
732         fields_init(&out_fields);
733         line_init(&in_line);
734
735         if (out_format->needs_two_passes)
736                 two_pass();
737         else
738                 one_pass();
739         return 0;
740 }