]> mj.ucw.cz Git - xsv.git/blob - xsv.c
README: Suggestions etc.
[xsv.git] / xsv.c
1 /*
2  *      The Swiss-Army Knife for CSV-like Files
3  *
4  *      (c) 2012 Martin Mares <mj@ucw.cz>
5  */
6
7 #define _GNU_SOURCE
8
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#include <limits.h>
#include <getopt.h>
#include <wchar.h>
#include <locale.h>

#include <pcre.h>
18
/* Function attributes: real annotations on GCC-compatible compilers, no-ops elsewhere. */
#if defined(__GNUC__)
#  define NONRET __attribute__((noreturn))
#  define UNUSED __attribute__((unused))
#else
#  define NONRET
#  define UNUSED
#endif
26
/* Forward declarations: the header writer uses these before they are defined. */
static void select_all_fields(void);
static void select_fields(void);
29
30 /*** General functions ***/
31
32 static void NONRET die(char *msg, ...)
33 {
34         va_list args;
35         va_start(args, msg);
36         fprintf(stderr, "xsv: ");
37         vfprintf(stderr, msg, args);
38         fputc('\n', stderr);
39         va_end(args);
40         exit(1);
41 }
42
43 /*** Memory allocation ***/
44
/*
 * malloc() wrapper that never returns NULL: on allocation failure
 * the program is terminated through die().
 */
static void *xmalloc(size_t bytes)
{
	void *mem = malloc(bytes);

	if (mem == NULL)
		die("Out of memory (cannot allocate %zu bytes)", bytes);
	return mem;
}
52
/* Allocate `bytes' bytes with xmalloc() and clear them to zero. */
static void *xmalloc_zero(size_t bytes)
{
	// memset() returns its first argument, i.e. the fresh block
	return memset(xmalloc(bytes), 0, bytes);
}
59
/*
 * realloc() wrapper: resizes `old' to `bytes' bytes and terminates
 * the program through die() when realloc() returns NULL.
 */
static void *xrealloc(void *old, size_t bytes)
{
	void *mem = realloc(old, bytes);

	if (mem == NULL)
		die("Out of memory (cannot allocate %zu bytes)", bytes);
	return mem;
}
67
/*
 * DECLARE_BUF(name, type) generates a growable array of `type':
 *
 *	name_t			the buffer structure (start pointer, item count, capacity)
 *	name_init(b)		make the buffer empty with no storage
 *	name_reset(b)		forget all items, keep the storage
 *	name_count(b)		number of items currently stored
 *	name_extend(b)		grow the capacity (16 items first, doubling afterwards)
 *	name_push(b)		append one item and return a pointer to it
 *	name_first(b)		pointer to the first item
 *	name_nth(b, n)		pointer to the n-th item (no bounds check)
 *
 * Pointers returned by push/first/nth are invalidated by the next push
 * that has to grow the buffer. Growing dies instead of overflowing the
 * `int' capacity or the byte size passed to xrealloc().
 */
#define DECLARE_BUF(name, type) \
	typedef struct { type *start; int count; int max; } name##_t;				\
	static inline void name##_init(name##_t *b) { b->start = NULL; b->count = b->max = 0; }	\
	static inline void name##_reset(name##_t *b) { b->count = 0; }				\
	static inline int name##_count(name##_t *b) { return b->count; }			\
	static void name##_extend(name##_t *b) {						\
		int new_max = 16;								\
		if (b->max) {									\
			/* Doubling past INT_MAX would be signed overflow */			\
			if (b->max > INT_MAX / 2)						\
				die("Buffer too large (cannot hold more than %d items)", INT_MAX); \
			new_max = 2 * b->max;							\
		}										\
		/* The byte size must be representable in size_t as well */			\
		if ((size_t) new_max > (size_t) -1 / sizeof(type))				\
			die("Buffer too large (cannot hold %d items)", new_max);		\
		b->start = xrealloc(b->start, (size_t) new_max * sizeof(type));			\
		b->max = new_max;								\
	}											\
	static inline type *name##_push(name##_t *b) {						\
		if (b->count >= b->max) name##_extend(b);					\
		return &b->start[b->count++];							\
	}											\
	static inline type *name##_first(name##_t *b) { return b->start; }			\
	static inline type *name##_nth(name##_t *b, int n) { return &b->start[n]; }		\
	// end
84
85 DECLARE_BUF(intarray, int);
86 DECLARE_BUF(stringarray, char *);
87
88 /*** Formats and their parameters ***/
89
/* Identifiers of the supported input/output back-ends. */
enum format_id {
	FORM_UNSPEC = 0,	// no format chosen
	FORM_TSV,		// tab-separated values
	FORM_CSV,		// comma-separated values
	FORM_WS,		// values separated by whitespace
	FORM_REGEX,		// separator given by a regular expression
	FORM_TMP,		// intermediate temporary file between two passes
	FORM_TABLE,		// formatted table
};
99
100 struct format {
101         enum format_id id;
102         int fs;
103         int quote;
104         int quiet;
105         int sloppy;
106         int (*read_line)(struct format *fmt);
107         void (*write_line)(struct format *fmt);
108         void (*write_grid)(struct format *fmt, int pos);        // -1=above, 1=below, 0=after header
109         int needs_stats;
110
111         // Field names
112         int has_header;
113         char *set_field_names;
114         struct field_names *field_names;
115
116         // CSV backend:
117         int always_quote;
118
119         // regex backend:
120         pcre *pcre;
121         pcre_extra *pcre_extra;
122
123         // Temporary file backend:
124         FILE *tmp_file;
125
126         // Table backend:
127         int table_sep;
128         int table_grid;
129 };
130
// Formats currently used for reading and writing
static struct format *in_format;
static struct format *out_format;

// Requested transformations and statistics gathering
static int want_trim;
static int want_equalize;
static int want_stats;
133
/* One field of the current line, stored as a slice of the line buffer. */
struct field {
	int start_pos;		// index of the first byte in the line buffer
	int len;		// length in bytes
};
138
139 DECLARE_BUF(fields, struct field);
140 DECLARE_BUF(line, unsigned char);
141
142 static fields_t in_fields, out_fields;
143 static struct field *in_field;
144 static line_t in_line;
145 static int line_number;
146
147 static int read_line(void)
148 {
149         fields_reset(&in_fields);
150         line_reset(&in_line);
151         in_field = NULL;
152         if (!in_format->read_line(in_format))
153                 return 0;
154         if (ferror_unlocked(stdin))
155                 die("I/O error when reading standard input");
156         return 1;
157 }
158
159 static void write_line(void)
160 {
161         out_format->write_line(out_format);
162         if (ferror_unlocked(stdout))
163                 die("I/O error when writing standard input");
164 }
165
166 static void write_grid(int pos)
167 {
168         if (out_format->write_grid) {
169                 out_format->write_grid(out_format, pos);
170                 if (ferror_unlocked(stdout))
171                         die("I/O error when writing standard input");
172         }
173 }
174
175 static void new_field(int pos)
176 {
177         in_field = fields_push(&in_fields);
178         in_field->start_pos = pos;
179         in_field->len = 0;
180 }
181
182 static void ensure_field(int pos)
183 {
184         if (!in_field)
185                 new_field(pos);
186 }
187
188 static unsigned char *get_field(fields_t *fields, int i, int *len)
189 {
190         struct field *f = fields_nth(fields, i);
191         *len = f->len;
192         return line_nth(&in_line, f->start_pos);
193 }
194
195 static void warn(struct format *fmt, char *msg, ...)
196 {
197         if (!fmt->quiet) {
198                 fprintf(stderr, "Warning at line %d: ", line_number);
199                 va_list args;
200                 va_start(args, msg);
201                 vfprintf(stderr, msg, args);
202                 va_end(args);
203                 fputc('\n', stderr);
204         }
205 }
206
207 static int next_line(void)
208 {
209         for (;;) {
210                 int c = getchar_unlocked();
211                 if (c == '\r')
212                         continue;
213                 if (c < 0)
214                         return !!line_count(&in_line);
215                 if (c == '\n')
216                         return 1;
217                 *line_push(&in_line) = c;
218         }
219 }
220
221 static int field_chars(struct field *f)
222 {
223         unsigned char *s = line_nth(&in_line, f->start_pos);
224         int i = 0;
225         mbstate_t mbs;
226         memset(&mbs, 0, sizeof(mbs));
227
228         int chars = 0;
229         while (i < f->len) {
230                 size_t k = mbrlen((char *) s + i, f->len - i, &mbs);
231                 if ((int) k <= 0)
232                         break;
233                 i += k;
234                 chars++;
235         }
236
237         return chars;
238 }
239
240 /*** Field statistics ***/
241
242 static intarray_t column_widths;
243
244 static void update_stats(void)
245 {
246         if (!want_stats)
247                 return;
248
249         for (int i = 0; i < fields_count(&out_fields); i++) {
250                 struct field *f = fields_nth(&out_fields, i);
251                 intarray_t *w = &column_widths;
252
253                 while (i >= intarray_count(w))
254                         *intarray_push(w) = 0;
255                 int fw = field_chars(f);
256                 if (*intarray_nth(w, i) < fw)
257                         *intarray_nth(w, i) = fw;
258         }
259 }
260
/*** CSV/TSV back-end ***/
262
263 static int csv_read(struct format *fmt)
264 {
265         int quoted = 0;
266         for (;;) {
267                 int c = getchar_unlocked();
268                 int i = line_count(&in_line);
269 restart:
270                 if (c == '\r')
271                         continue;
272                 if (c < 0 || c == '\n') {
273                         if (quoted)
274                                 warn(fmt, "Missing closing quote.");
275                         if (c < 0)
276                                 return !!fields_count(&in_fields);
277                         else
278                                 return 1;
279                 }
280                 if (quoted) {
281                         if (c == fmt->quote) {
282                                 c = getchar_unlocked();
283                                 if (c != fmt->quote) {
284                                         quoted = 0;
285                                         goto restart;
286                                 }
287                                 // Two quotes assimilate to one
288                         }
289                         // Fall through to pushing the character
290                 } else if (c == fmt->quote) {
291                         quoted = 1;
292                         continue;
293                 } else if (c == fmt->fs && !quoted) {
294                         ensure_field(i);
295                         new_field(i);
296                         continue;
297                 }
298                 ensure_field(i);
299                 *line_push(&in_line) = c;
300                 in_field->len++;
301         }
302 }
303
/* Return 1 if `c' is a space, tab or form feed, 0 otherwise. */
static int is_ws(int c)
{
	switch (c) {
		case ' ':
		case '\t':
		case '\f':
			return 1;
		default:
			return 0;
	}
}
308
309 static void csv_write(struct format *fmt)
310 {
311         for (int i=0; i < fields_count(&out_fields); i++) {
312                 int len;
313                 unsigned char *p = get_field(&out_fields, i, &len);
314
315                 int need_quotes = 0;
316                 if (fmt->quote >= 0) {
317                         need_quotes = fmt->always_quote;
318                         for (int j=0; !need_quotes && j < len; j++) {
319                                 if (p[j] == fmt->fs || p[j] == fmt->quote)
320                                         need_quotes = 1;
321                         }
322                 }
323                 if (i)
324                         putchar_unlocked(fmt->fs);
325                 if (need_quotes)
326                         putchar_unlocked(fmt->quote);
327                 for (int j=0; j < len; j++) {
328                         int c = p[j];
329                         if (c == fmt->fs && !need_quotes)
330                                 warn(fmt, "Field separator found inside field and quoting is turned off.");
331                         if (c == fmt->quote)
332                                 putchar_unlocked(c);
333                         putchar_unlocked(c);
334                 }
335                 if (need_quotes)
336                         putchar_unlocked(fmt->quote);
337         }
338         putchar_unlocked('\n');
339 }
340
341 /*** White-space back-end ***/
342
343 static int ws_read(struct format *fmt)
344 {
345         if (!next_line())
346                 return 0;
347
348         unsigned char *line = line_first(&in_line);
349         int n = line_count(&in_line);
350         if (!n)
351                 return 1;
352
353         int ws = 0;
354         new_field(0);
355         for (int i=0; i<n; i++) {
356                 int c = line[i];
357                 if (is_ws(c)) {
358                         ws++;
359                 } else {
360                         if (ws) {
361                                 if (!in_field->start_pos &&
362                                     !in_field->len &&
363                                     fmt->sloppy)
364                                         in_field->start_pos = i;
365                                 else
366                                         new_field(i);
367                                 ws = 0;
368                         }
369                         in_field->len++;
370                 }
371         }
372
373         if (ws && !fmt->sloppy)
374                 new_field(n);
375         return 1;
376 }
377
378 /*** Regex back-end ***/
379
380 static const char *regex_set(struct format *f, char *rx)
381 {
382         const char *err;
383         int errpos;
384         f->pcre = pcre_compile(rx, PCRE_DOLLAR_ENDONLY, &err, &errpos, NULL);
385         if (!f->pcre)
386                 return err;
387
388         f->pcre_extra = pcre_study(f->pcre, 0, &err);
389         if (!f->pcre_extra)
390                 return err;
391
392         return NULL;
393 }
394
395 static int regex_read(struct format *fmt)
396 {
397         if (!next_line())
398                 return 0;
399
400         unsigned char *c = line_first(&in_line);
401         int n = line_count(&in_line);
402         if (!n)
403                 return 1;
404
405         int i = 0;
406         for (;;) {
407                 int ovec[3];
408                 int err = pcre_exec(fmt->pcre, fmt->pcre_extra, (char *) c, n, i, 0, ovec, 3);
409                 if (err < 0) {
410                         if (err != PCRE_ERROR_NOMATCH)
411                                 warn(fmt, "PCRE matching error %d", err);
412                         // No further occurrence of the separator: the rest is a single field
413                         if (!fmt->sloppy || i < n) {
414                                 new_field(i);
415                                 in_field->len = n - i;
416                         }
417                         return 1;
418                 }
419                 if (ovec[0] == ovec[1]) {
420                         warn(fmt, "Regular expression matched an empty separator.");
421                         new_field(i);
422                         in_field->len = n - i;
423                         return 1;
424                 }
425                 if (!fmt->sloppy || ovec[0]) {
426                         new_field(i);
427                         in_field->len = ovec[0] - i;
428                 }
429                 i = ovec[1];
430         }
431 }
432
433 /*** Table back-end ***/
434
435 static void table_write(struct format *fmt)
436 {
437         for (int i = 0; i < intarray_count(&column_widths); i++) {
438                 if (fmt->table_grid) {
439                         putchar_unlocked('|');
440                         printf("%*s", fmt->table_sep / 2, "");
441                 } else if (i)
442                         printf("%*s", fmt->table_sep, "");
443
444                 int cw = *intarray_nth(&column_widths, i);
445                 int fw = 0;
446                 if (i < fields_count(&out_fields)) {
447                         int len;
448                         unsigned char *p = get_field(&out_fields, i, &len);
449                         fw = field_chars(fields_nth(&out_fields, i));
450                         if (fw > cw) {
451                                 warn(fmt, "Internal error: Wrongly calculated width of column %d (%d > %d)", i, fw, cw);
452                                 cw = fw;
453                         }
454                         while (len--)
455                                 putchar(*p++);
456                 }
457                 while (fw < cw) {
458                         putchar_unlocked(' ');
459                         fw++;
460                 }
461
462                 if (fmt->table_grid)
463                         printf("%*s", fmt->table_sep - fmt->table_sep / 2, "");
464         }
465
466         if (fmt->table_grid)
467                 putchar_unlocked('|');
468         putchar_unlocked('\n');
469 }
470
471 static void table_write_grid(struct format *fmt, int pos UNUSED)
472 {
473         if (!fmt->table_grid)
474                 return;
475
476         for (int i = 0; i < intarray_count(&column_widths); i++) {
477                 putchar_unlocked('+');
478                 int w = fmt->table_sep + *intarray_nth(&column_widths, i);
479                 while (w--)
480                         putchar('-');
481         }
482         putchar_unlocked('+');
483         putchar_unlocked('\n');
484 }
485
486 /*** Temporary file back-end ***/
487
488 static int tmp_read(struct format *fmt)
489 {
490         FILE *tf = fmt->tmp_file;
491
492         for (;;) {
493                 int c = getc_unlocked(tf);
494                 if (c < 0)
495                         return 0;
496                 if (c == 0xff)
497                         return 1;
498                 if (c == 0xfe) {
499                         c = getc_unlocked(tf);
500                         c = (c << 8) | getc_unlocked(tf);
501                         c = (c << 8) | getc_unlocked(tf);
502                         c = (c << 8) | getc_unlocked(tf);
503                 }
504                 new_field(line_count(&in_line));
505                 in_field->len = c;
506                 while (c--) {
507                         int x = getc_unlocked(tf);
508                         if (x < 0)
509                                 die("Truncated temporary file");
510                         *line_push(&in_line) = x;
511                 }
512         }
513
514         if (ferror_unlocked(tf))
515                 die("I/O error when reading temporary file");
516 }
517
518 static void tmp_write(struct format *fmt)
519 {
520         FILE *tf = fmt->tmp_file;
521
522         for (int i = 0; i < fields_count(&out_fields); i++) {
523                 int len;
524                 unsigned char *p = get_field(&out_fields, i, &len);
525
526                 if (len < 0xfe)
527                         putc_unlocked(len, tf);
528                 else {
529                         putc_unlocked(0xfe, tf);
530                         putc_unlocked((len >> 24) & 0xff, tf);
531                         putc_unlocked((len >> 16) & 0xff, tf);
532                         putc_unlocked((len >> 8) & 0xff, tf);
533                         putc_unlocked(len & 0xff, tf);
534                 }
535
536                 while (len--)
537                         putc_unlocked(*p++, tf);
538         }
539         putc_unlocked(0xff, tf);
540
541         if (ferror_unlocked(tf))
542                 die("I/O error when writing temporary file");
543 }
544
545 /*** Transforms ***/
546
547 static void trim_fields(void)
548 {
549         unsigned char *line = line_first(&in_line);
550         for (int i = 0; i < fields_count(&in_fields); i++) {
551                 struct field *f = fields_nth(&in_fields, i);
552                 while (f->len && is_ws(line[f->start_pos]))
553                         f->start_pos++, f->len--;
554                 while (f->len && is_ws(line[f->start_pos + f->len - 1]))
555                         f->len--;
556         }
557 }
558
559 static void equalize_fields(void)
560 {
561         while (fields_count(&out_fields) < intarray_count(&column_widths)) {
562                 struct field *f = fields_push(&out_fields);
563                 f->start_pos = f->len = 0;
564         }
565 }
566
567 /*** Field names and headers ***/
568
569 struct field_names {
570         stringarray_t names;
571 };
572
573 static void add_field(struct field_names *fn, char *name, int namelen)
574 {
575         char *n = xmalloc(namelen + 1);
576         memcpy(n, name, namelen);
577         n[namelen] = 0;
578         *stringarray_push(&fn->names) = n;
579 }
580
/*
 * Split `names' at commas and append every piece to `fn'.
 * Empty pieces are kept, so "a,,b" adds three names; a NULL
 * argument adds nothing.
 */
static void add_field_names(struct field_names *fn, char *names)
{
	for (char *item = names; item; ) {
		char *comma = strchr(item, ',');

		if (comma) {
			add_field(fn, item, comma - item);
			item = comma + 1;
		} else {
			add_field(fn, item, (int) strlen(item));
			item = NULL;
		}
	}
}
591
592 static void read_header(void)
593 {
594         if (!(in_format->has_header || in_format->set_field_names))
595                 return;
596
597         struct field_names *fn = xmalloc_zero(sizeof(*fn));
598         in_format->field_names = fn;
599
600         if (in_format->has_header) {
601                 if (!read_line())
602                         die("Missing input header");
603         }
604
605         if (in_format->set_field_names) {
606                 add_field_names(fn, in_format->set_field_names);
607         } else {
608                 for (int i = 0; i < fields_count(&in_fields); i++) {
609                         int len;
610                         char *s = (char *) get_field(&in_fields, i, &len);
611                         add_field(fn, s, len);
612                 }
613         }
614 }
615
616 static void write_header(void)
617 {
618         if (!out_format->has_header) {
619                 write_grid(-1);
620                 return;
621         }
622
623         int want_select_fields = 0;
624         if (out_format->set_field_names) {
625                 struct field_names *fn = xmalloc_zero(sizeof(*fn));
626                 out_format->field_names = fn;
627                 add_field_names(fn, out_format->set_field_names);
628         } else if (in_format->field_names) {
629                 out_format->field_names = in_format->field_names;
630                 want_select_fields = 1;
631         } else
632                 die("Output header requested, but no field names specified");
633
634         line_reset(&in_line);
635         fields_reset(&in_fields);
636         struct field_names *fn = out_format->field_names;
637         for (int i = 0; i < stringarray_count(&fn->names); i++) {
638                 struct field *f = fields_push(&in_fields);
639                 f->start_pos = line_count(&in_line);
640                 f->len = 0;
641                 char *s = *stringarray_nth(&fn->names, i);
642                 while (*s) {
643                         *line_push(&in_line) = *s++;
644                         f->len++;
645                 }
646         }
647
648         fields_reset(&out_fields);
649         if (want_select_fields)
650                 select_fields();
651         else
652                 select_all_fields();
653
654         // This is tricky: when we are formatting a table, field names are normally
655         // calculated in pass 1, but the header is written in pass 2, so we have to
656         // update column statistics, because field name can be too wide to fit.
657         want_stats++;
658         update_stats();
659         want_stats--;
660         if (want_equalize)
661                 equalize_fields();
662         write_grid(-1);
663         write_line();
664         write_grid(0);
665 }
666
/* Finish the output by writing the bottom grid line (if the format has one). */
static void write_footer(void)
{
	const int below = 1;

	write_grid(below);
}
671
672 static int find_field_by_name(struct field_names *fn, char *name)
673 {
674         for (int i = 0; i < stringarray_count(&fn->names); i++)
675                 if (!strcmp(*stringarray_nth(&fn->names, i), name))
676                         return i + 1;
677         return -1;
678 }
679
680 /*** Field selection ***/
681
682 struct selector {
683         int first_field, last_field;            // 0 means "boundary"
684 };
685
686 DECLARE_BUF(selectors, struct selector);
687 static selectors_t selectors;
688
/*
 * Parse a non-negative decimal number.
 *
 * Returns the value, or -1 if the string contains anything but digits
 * or if the number has more than 9 significant digits. An empty string
 * parses as 0.
 */
static int parse_field_num(char *str)
{
	int value = 0;

	for (const char *p = str; *p != '\0'; p++) {
		int digit = *p - '0';

		if (digit < 0 || digit > 9)
			return -1;
		// Refuse to grow further, which also keeps the value within int range
		if (value >= 100000000)
			return -1;
		value = value * 10 + digit;
	}
	return value;
}
703
704 static int parse_field(char *str)
705 {
706         if (!*str)
707                 return 0;
708
709         int f = parse_field_num(str);
710         if (f > 0)
711                 return f;
712
713         if (in_format->field_names && (f = find_field_by_name(in_format->field_names, str)) > 0)
714                 return f;
715
716         die("Unknown field `%s'", str);
717 }
718
719 static char *parse_selector(char *str)
720 {
721         char buf[strlen(str) + 1];
722         strcpy(buf, str);
723
724         struct selector *s = selectors_push(&selectors);
725         char *sep = strchr(buf, '-');
726         if (sep) {
727                 *sep++ = 0;
728                 s->first_field = parse_field(buf);
729                 s->last_field = parse_field(sep);
730         } else
731                 s->first_field = s->last_field = parse_field(buf);
732
733         return NULL;
734 }
735
736 static void finish_parse_selectors(void)
737 {
738         if (!selectors_count(&selectors))
739                 parse_selector("-");
740 }
741
742 static void select_fields(void)
743 {
744         for (int i = 0; i < selectors_count(&selectors); i++) {
745                 struct selector *s = selectors_nth(&selectors, i);
746                 int first = s->first_field;
747                 if (first <= 0)
748                         first = 1;
749                 int last = s->last_field;
750                 if (last <= 0)
751                         last = fields_count(&in_fields);
752                 for (int j = first; j <= last; j++) {
753                         struct field *f = fields_push(&out_fields);
754                         if (j >= 1 && j <= fields_count(&in_fields))
755                                 *f = *fields_nth(&in_fields, j-1);
756                         else
757                                 f->start_pos = f->len = 0;
758                 }
759         }
760 }
761
762 static void select_all_fields(void)
763 {
764         for (int i = 0; i < fields_count(&in_fields); i++)
765                 *fields_push(&out_fields) = *fields_nth(&in_fields, i);
766 }
767
768 /*** Processing of files ***/
769
770 static void one_pass(int pass)
771 {
772         if (pass & 2)
773                 write_header();
774
775         for (;;) {
776                 line_number++;
777                 if (!read_line())
778                         break;
779
780                 if (want_trim && (pass & 1))
781                         trim_fields();
782
783                 fields_reset(&out_fields);
784                 if (pass & 1)
785                         select_fields();
786                 else
787                         select_all_fields();
788
789                 if (want_equalize && (pass & 2))
790                         equalize_fields();
791                 update_stats();
792                 write_line();
793         }
794
795         if (pass & 2)
796                 write_footer();
797 }
798
799 static void two_pass(void)
800 {
801         struct format *final_format = out_format;
802
803         // We need to use character set info from the current locale
804         setlocale(LC_CTYPE, "");
805
806         // Pass 1: Set up writer of intermediate format
807         out_format = xmalloc_zero(sizeof(*out_format));
808         out_format->id = FORM_TMP;
809         out_format->read_line = tmp_read;
810         out_format->write_line = tmp_write;
811         out_format->tmp_file = tmpfile();
812         out_format->field_names = in_format->field_names;
813         one_pass(1);
814
815         // Pass 2: Set up reader of intermediate format
816         in_format = out_format;
817         rewind(in_format->tmp_file);
818         line_number = 0;
819         out_format = final_format;
820         want_stats = 0;
821         one_pass(2);
822         fclose(in_format->tmp_file);
823 }
824
825 /*** Parsing of arguments ***/
826
827 static void NONRET usage(void)
828 {
829         printf("\
830 Usage: xsv <in-format> [<out-format>] <options> [<fields>]\n\
831 \n\
832 Formats:\n\
833 -t, --tsv               Tab-separated values (default)\n\
834 -c, --csv               Comma-separated values\n\
835 -w, --ws                Values separated by arbitrary whitespace\n\
836 -r, --regex=<rx>        Separator given by Perl regular expression (input only)\n\
837     --table             Format a table (output only)\n\
838 \n\
839 Format parameters:\n\
840 -d, --fs=<char>         Delimiter of fields\n\
841 -f, --fields=<f>,...    Set field names\n\
842 -h, --header            The first line contains field names\n\
843 -q, --quiet             Do not show warnings\n\
844     --always-quote      Put quotes around all fields (CSV output only)\n\
845     --table-sep=<n>     Separate table columns by <n> spaces (default: 2)\n\
846     --grid              Separate table columns by grid lines\n\
847 -s, --sloppy            Ignore separators at the start/end of line (ws/regex only)\n\
848 \n\
849 Other options:\n\
850     --trim              Trim leading and trailing whitespaces in fields\n\
851     --equalize          Pad all lines to the maximum number of fields\n\
852 ");
853         exit(0);
854 }
855
856 static void NONRET bad_args(const char *msg, ...)
857 {
858         if (msg) {
859                 va_list args;
860                 va_start(args, msg);
861                 fprintf(stderr, "xsv: ");
862                 vfprintf(stderr, msg, args);
863                 fputc('\n', stderr);
864                 va_end(args);
865         }
866         fprintf(stderr, "Try `xsv --help' for more information.\n");
867         exit(1);
868 }
869
// Short options for getopt_long(); a colon marks an option taking an argument.
// 's' matches the "-s, --sloppy" option documented in the usage text.
static const char short_options[] = "cd:f:hqr:stwW";
871
/* Codes of options without a short form; they start above the range of characters. */
enum long_options {
	OPT_HELP = 256,
	OPT_VERSION,		// 257
	OPT_TRIM,		// 258
	OPT_ALWAYS_QUOTE,	// 259
	OPT_TABLE,		// 260
	OPT_TABLE_SEP,		// 261
	OPT_GRID,		// 262
	OPT_EQUALIZE,		// 263
};
882
883 static const struct option long_options[] = {
884         { "always-quote",       0,      NULL,   OPT_ALWAYS_QUOTE },
885         { "csv",                0,      NULL,   'c' },
886         { "equalize",           0,      NULL,   OPT_EQUALIZE },
887         { "fields",             1,      NULL,   'f' },
888         { "fs",                 1,      NULL,   'd' },
889         { "grid",               0,      NULL,   OPT_GRID },
890         { "header",             0,      NULL,   'h' },
891         { "help",               0,      NULL,   OPT_HELP },
892         { "quiet",              0,      NULL,   'q' },
893         { "regex",              1,      NULL,   'r' },
894         { "sloppy",             0,      NULL,   's' },
895         { "table",              0,      NULL,   OPT_TABLE },
896         { "table-sep",          1,      NULL,   OPT_TABLE_SEP },
897         { "trim",               0,      NULL,   OPT_TRIM },
898         { "tsv",                0,      NULL,   't' },
899         { "version",            0,      NULL,   OPT_VERSION },
900         { "ws",                 0,      NULL,   'w' },
901         { NULL,                 0,      NULL,   0 },
902 };
903
904 static void set_format(int format_id)
905 {
906         struct format *f = xmalloc_zero(sizeof(*f));
907         f->id = format_id;
908
909         switch (format_id) {
910                 case FORM_TSV:
911                         f->fs = '\t';
912                         f->quote = -1;
913                         f->read_line = csv_read;
914                         f->write_line = csv_write;
915                         break;
916                 case FORM_CSV:
917                         f->fs = ',';
918                         f->quote = '"';
919                         f->read_line = csv_read;
920                         f->write_line = csv_write;
921                         break;
922                 case FORM_WS:
923                         f->fs = ' ';
924                         f->quote = -1;
925                         f->read_line = ws_read;
926                         f->write_line = csv_write;
927                         break;
928                 case FORM_REGEX:
929                         f->read_line = regex_read;
930                         break;
931                 case FORM_TABLE:
932                         f->write_line = table_write;
933                         f->write_grid = table_write_grid;
934                         f->needs_stats = 1;
935                         f->table_sep = 2;
936                         break;
937         }
938
939         if (!in_format)
940                 in_format = f;
941         else if (!out_format)
942                 out_format = f;
943         else
944                 bad_args("At most two formats may be given.");
945 }
946
947 static struct format *current_format(void)
948 {
949         if (out_format)
950                 return out_format;
951         if (in_format)
952                 return in_format;
953         set_format(FORM_TSV);
954         return in_format;
955 }
956
957 int main(int argc, char **argv)
958 {
959         int opt;
960         const char *err;
961
962         while ((opt = getopt_long(argc, argv, short_options, long_options, NULL)) >= 0)
963                 switch (opt) {
964                         case 'c':
965                                 set_format(FORM_CSV);
966                                 break;
967                         case 'd':
968                                 if (optarg[0])
969                                         current_format()->fs = optarg[0];
970                                 else
971                                         bad_args("No field delimiter given.");
972                                 break;
973                         case 'f':
974                                 current_format()->set_field_names = optarg;
975                                 break;
976                         case 'h':
977                                 current_format()->has_header = 1;
978                                 break;
979                         case 'q':
980                                 current_format()->quiet = 1;
981                                 break;
982                         case 'r':
983                                 set_format(FORM_REGEX);
984                                 err = regex_set(current_format(), optarg);
985                                 if (err)
986                                         bad_args("Error compiling regex: %s", err);
987                                 break;
988                         case 's':
989                                 if (current_format()->id != FORM_WS && current_format()->id != FORM_REGEX)
990                                         bad_args("--sloppy makes sense only for --ws or --regex.");
991                                 current_format()->sloppy = 1;
992                                 break;
993                         case 't':
994                                 set_format(FORM_TSV);
995                                 break;
996                         case 'w':
997                                 set_format(FORM_WS);
998                                 break;
999                         case OPT_ALWAYS_QUOTE:
1000                                 if (current_format()->id != FORM_CSV)
1001                                         bad_args("--always-quote makes sense only for --csv.");
1002                                 current_format()->always_quote = 1;
1003                                 break;
1004                         case OPT_HELP:
1005                                 usage();
1006                         case OPT_VERSION:
1007                                 puts("This is xsv version " VERSION ".");
1008                                 exit(0);
1009                         case OPT_TRIM:
1010                                 want_trim = 1;
1011                                 break;
1012                         case OPT_TABLE:
1013                                 set_format(FORM_TABLE);
1014                                 break;
1015                         case OPT_TABLE_SEP:
1016                                 current_format()->table_sep = atoi(optarg);
1017                                 break;
1018                         case OPT_GRID:
1019                                 current_format()->table_grid = 1;
1020                                 break;
1021                         case OPT_EQUALIZE:
1022                                 want_equalize = 1;
1023                                 break;
1024                         default:
1025                                 bad_args(NULL);
1026                 }
1027
1028         current_format();
1029         if (!out_format)
1030                 out_format = in_format;
1031         if (!in_format->read_line)
1032                 bad_args("Write-only format selected for input.");
1033         if (!out_format->write_line)
1034                 bad_args("Read-only format selected for output.");
1035         read_header();
1036
1037         for (int i = optind; i < argc; i++) {
1038                 err = parse_selector(argv[i]);
1039                 if (err)
1040                         bad_args(err);
1041         }
1042         finish_parse_selectors();
1043
1044         want_stats = out_format->needs_stats | want_equalize;
1045         if (want_stats)
1046                 two_pass();
1047         else
1048                 one_pass(3);
1049         return 0;
1050 }