mj.ucw.cz Git - xsv.git/blob - xsv.c
--strict-ws is gone, added a more general --sloppy switch
[xsv.git] / xsv.c
1 /*
2  *      A Swiss-Army Knife for CSV-like Files
3  *
4  *      (c) 2012 Martin Mares <mj@ucw.cz>
5  */
6
7 #define _GNU_SOURCE
8
9 #include <stdio.h>
10 #include <stdlib.h>
11 #include <string.h>
12 #include <stdarg.h>
13 #include <getopt.h>
14 #include <wchar.h>
15 #include <locale.h>
16
17 #include <pcre.h>
18
19 #ifdef __GNUC__
20 #define NONRET __attribute__((noreturn))
21 #define UNUSED __attribute__((unused))
22 #else
23 #define NONRET
24 #define UNUSED
25 #endif
26
27 /*** General functions ***/
28
29 static void NONRET die(char *msg, ...)
30 {
31         va_list args;
32         va_start(args, msg);
33         fprintf(stderr, "xsv: ");
34         vfprintf(stderr, msg, args);
35         fputc('\n', stderr);
36         va_end(args);
37         exit(1);
38 }
39
40 /*** Memory allocation ***/
41
/*
 * malloc() wrapper which never hands a NULL pointer back to the caller:
 * if malloc() returns NULL, the program is terminated through die().
 */
static void *xmalloc(size_t bytes)
{
	void *mem = malloc(bytes);

	if (mem == NULL)
		die("Out of memory (cannot allocate %zu bytes)", bytes);
	return mem;
}
49
/*
 * Allocate `bytes` bytes with xmalloc() and clear them to zero.
 */
static void *xmalloc_zero(size_t bytes)
{
	// memset() returns its first argument, i.e., the fresh block
	return memset(xmalloc(bytes), 0, bytes);
}
56
/*
 * realloc() wrapper: resize `old` to `bytes` bytes and return the (possibly
 * moved) block. If realloc() returns NULL, the program is terminated
 * through die().
 */
static void *xrealloc(void *old, size_t bytes)
{
	void *mem = realloc(old, bytes);

	if (mem == NULL)
		die("Out of memory (cannot allocate %zu bytes)", bytes);
	return mem;
}
64
/*
 * DECLARE_BUF(name, type) generates a growable array of <type>:
 *
 *	name_t		the array: start pointer, number of used items, capacity
 *	name_init(b)	make the array empty with no storage attached
 *	name_reset(b)	drop all items, but keep the storage for re-use
 *	name_count(b)	number of items currently stored
 *	name_extend(b)	enlarge the capacity (16 items at first, doubled afterwards)
 *	name_push(b)	append one uninitialized item and return a pointer to it
 *	name_first(b)	pointer to the start of the storage
 *	name_nth(b, n)	pointer to the n-th item (no bounds checking)
 *
 * The storage is reallocated when it grows, so pointers obtained from
 * push/first/nth must not be kept across a later push.
 */
#define DECLARE_BUF(name, type) \
	typedef struct { type *start; int count; int max; } name##_t;				\
	static inline void name##_init(name##_t *buf)						\
	{											\
		buf->start = NULL;								\
		buf->count = 0;									\
		buf->max = 0;									\
	}											\
	static inline void name##_reset(name##_t *buf) { buf->count = 0; }			\
	static inline int name##_count(name##_t *buf) { return buf->count; }			\
	static void name##_extend(name##_t *buf)						\
	{											\
		if (buf->max)									\
			buf->max *= 2;								\
		else										\
			buf->max = 16;								\
		buf->start = xrealloc(buf->start, buf->max * sizeof(type));			\
	}											\
	static inline type *name##_push(name##_t *buf)						\
	{											\
		if (buf->count >= buf->max)							\
			name##_extend(buf);							\
		type *slot = buf->start + buf->count;						\
		buf->count++;									\
		return slot;									\
	}											\
	static inline type *name##_first(name##_t *buf) { return buf->start; }			\
	static inline type *name##_nth(name##_t *buf, int n) { return buf->start + n; }	\
	/* end of DECLARE_BUF */

DECLARE_BUF(intarray, int);
DECLARE_BUF(stringarray, char *);
84
85 /*** Formats and their parameters ***/
86
/* Identifiers of the supported input/output back-ends. */
enum format_id {
	FORM_UNSPEC,		// Zero value
	FORM_TSV,		// Values separated by a single character, no quoting
	FORM_CSV,		// Values separated by a single character, with quoting
	FORM_WS,		// Values separated by runs of whitespace
	FORM_REGEX,		// Separator described by a regular expression
	FORM_TMP,		// Intermediate temporary file used by two-pass processing
	FORM_TABLE,		// Table with padded columns
};
96
97 struct format {
98         enum format_id id;
99         int fs;
100         int quote;
101         int quiet;
102         int sloppy;
103         int (*read_line)(struct format *fmt);
104         void (*write_line)(struct format *fmt);
105         void (*write_grid)(struct format *fmt, int pos);        // -1=above, 1=below, 0=after header
106         int needs_stats;
107
108         // Field names
109         int has_header;
110         char *set_field_names;
111         struct field_names *field_names;
112
113         // CSV backend:
114         int always_quote;
115
116         // regex backend:
117         pcre *pcre;
118         pcre_extra *pcre_extra;
119
120         // Temporary file backend:
121         FILE *tmp_file;
122
123         // Table backend:
124         int table_sep;
125         int table_grid;
126 };
127
128 static struct format *in_format, *out_format;
129 static int want_trim, want_equalize, want_stats;
130
131 struct field {
132         int start_pos;
133         int len;
134 };
135
136 DECLARE_BUF(fields, struct field);
137 DECLARE_BUF(line, unsigned char);
138
139 static fields_t in_fields, out_fields;
140 static struct field *in_field;
141 static line_t in_line;
142 static int line_number;
143
144 static int read_line(void)
145 {
146         fields_reset(&in_fields);
147         line_reset(&in_line);
148         in_field = NULL;
149         if (!in_format->read_line(in_format))
150                 return 0;
151         if (ferror_unlocked(stdin))
152                 die("I/O error when reading standard input");
153         return 1;
154 }
155
156 static void write_line(void)
157 {
158         out_format->write_line(out_format);
159         if (ferror_unlocked(stdout))
160                 die("I/O error when writing standard input");
161 }
162
163 static void write_grid(int pos)
164 {
165         if (out_format->write_grid) {
166                 out_format->write_grid(out_format, pos);
167                 if (ferror_unlocked(stdout))
168                         die("I/O error when writing standard input");
169         }
170 }
171
172 static void new_field(int pos)
173 {
174         in_field = fields_push(&in_fields);
175         in_field->start_pos = pos;
176         in_field->len = 0;
177 }
178
179 static void ensure_field(int pos)
180 {
181         if (!in_field)
182                 new_field(pos);
183 }
184
185 static unsigned char *get_field(fields_t *fields, int i, int *len)
186 {
187         struct field *f = fields_nth(fields, i);
188         *len = f->len;
189         return line_nth(&in_line, f->start_pos);
190 }
191
192 static void warn(struct format *fmt, char *msg, ...)
193 {
194         if (!fmt->quiet) {
195                 fprintf(stderr, "Warning at line %d: ", line_number);
196                 va_list args;
197                 va_start(args, msg);
198                 vfprintf(stderr, msg, args);
199                 va_end(args);
200                 fputc('\n', stderr);
201         }
202 }
203
204 static int next_line(void)
205 {
206         for (;;) {
207                 int c = getchar_unlocked();
208                 if (c == '\r')
209                         continue;
210                 if (c < 0)
211                         return !!line_count(&in_line);
212                 if (c == '\n')
213                         return 1;
214                 *line_push(&in_line) = c;
215         }
216 }
217
218 static int field_chars(struct field *f)
219 {
220         unsigned char *s = line_nth(&in_line, f->start_pos);
221         int i = 0;
222         mbstate_t mbs;
223         memset(&mbs, 0, sizeof(mbs));
224
225         int chars = 0;
226         while (i < f->len) {
227                 size_t k = mbrlen((char *) s + i, f->len - i, &mbs);
228                 if ((int) k <= 0)
229                         break;
230                 i += k;
231                 chars++;
232         }
233
234         return chars;
235 }
236
237 /*** Field statistics ***/
238
239 static intarray_t column_widths;
240
241 static void update_stats(void)
242 {
243         if (!want_stats)
244                 return;
245
246         for (int i = 0; i < fields_count(&out_fields); i++) {
247                 struct field *f = fields_nth(&out_fields, i);
248                 intarray_t *w = &column_widths;
249
250                 while (i >= intarray_count(w))
251                         *intarray_push(w) = 0;
252                 int fw = field_chars(f);
253                 if (*intarray_nth(w, i) < fw)
254                         *intarray_nth(w, i) = fw;
255         }
256 }
257
258 /*** CSV/TSV back-end */
259
260 static int csv_read(struct format *fmt)
261 {
262         int quoted = 0;
263         for (;;) {
264                 int c = getchar_unlocked();
265                 int i = line_count(&in_line);
266 restart:
267                 if (c == '\r')
268                         continue;
269                 if (c < 0 || c == '\n') {
270                         if (quoted)
271                                 warn(fmt, "Missing closing quote.");
272                         if (c < 0)
273                                 return !!fields_count(&in_fields);
274                         else
275                                 return 1;
276                 }
277                 if (quoted) {
278                         if (c == fmt->quote) {
279                                 c = getchar_unlocked();
280                                 if (c != fmt->quote) {
281                                         quoted = 0;
282                                         goto restart;
283                                 }
284                                 // Two quotes assimilate to one
285                         }
286                         // Fall through to pushing the character
287                 } else if (c == fmt->quote) {
288                         quoted = 1;
289                         continue;
290                 } else if (c == fmt->fs && !quoted) {
291                         ensure_field(i);
292                         new_field(i);
293                         continue;
294                 }
295                 ensure_field(i);
296                 *line_push(&in_line) = c;
297                 in_field->len++;
298         }
299 }
300
/*
 * Return 1 if `c` is a space, a tab or a form feed, otherwise 0.
 */
static int is_ws(int c)
{
	switch (c) {
	case ' ':
	case '\t':
	case '\f':
		return 1;
	default:
		return 0;
	}
}
305
306 static void csv_write(struct format *fmt)
307 {
308         for (int i=0; i < fields_count(&out_fields); i++) {
309                 int len;
310                 unsigned char *p = get_field(&out_fields, i, &len);
311
312                 int need_quotes = 0;
313                 if (fmt->quote >= 0) {
314                         need_quotes = fmt->always_quote;
315                         for (int j=0; !need_quotes && j < len; j++) {
316                                 if (p[j] == fmt->fs || p[j] == fmt->quote)
317                                         need_quotes = 1;
318                         }
319                 }
320                 if (i)
321                         putchar_unlocked(fmt->fs);
322                 if (need_quotes)
323                         putchar_unlocked(fmt->quote);
324                 for (int j=0; j < len; j++) {
325                         int c = p[j];
326                         if (c == fmt->fs && !need_quotes)
327                                 warn(fmt, "Field separator found inside field and quoting is turned off.");
328                         if (c == fmt->quote)
329                                 putchar_unlocked(c);
330                         putchar_unlocked(c);
331                 }
332                 if (need_quotes)
333                         putchar_unlocked(fmt->quote);
334         }
335         putchar_unlocked('\n');
336 }
337
338 /*** White-space back-end ***/
339
340 static int ws_read(struct format *fmt)
341 {
342         if (!next_line())
343                 return 0;
344
345         unsigned char *line = line_first(&in_line);
346         int n = line_count(&in_line);
347         if (!n)
348                 return 1;
349
350         int ws = 0;
351         new_field(0);
352         for (int i=0; i<n; i++) {
353                 int c = line[i];
354                 if (is_ws(c)) {
355                         ws++;
356                 } else {
357                         if (ws) {
358                                 if (!in_field->start_pos &&
359                                     !in_field->len &&
360                                     fmt->sloppy)
361                                         in_field->start_pos = i;
362                                 else
363                                         new_field(i);
364                                 ws = 0;
365                         }
366                         in_field->len++;
367                 }
368         }
369
370         if (ws && !fmt->sloppy)
371                 new_field(n);
372         return 1;
373 }
374
375 /*** Regex back-end ***/
376
377 static const char *regex_set(struct format *f, char *rx)
378 {
379         const char *err;
380         int errpos;
381         f->pcre = pcre_compile(rx, PCRE_DOLLAR_ENDONLY, &err, &errpos, NULL);
382         if (!f->pcre)
383                 return err;
384
385         f->pcre_extra = pcre_study(f->pcre, 0, &err);
386         if (!f->pcre_extra)
387                 return err;
388
389         return NULL;
390 }
391
392 static int regex_read(struct format *fmt)
393 {
394         if (!next_line())
395                 return 0;
396
397         unsigned char *c = line_first(&in_line);
398         int n = line_count(&in_line);
399         if (!n)
400                 return 1;
401
402         int i = 0;
403         for (;;) {
404                 int ovec[3];
405                 int err = pcre_exec(fmt->pcre, fmt->pcre_extra, (char *) c, n, i, 0, ovec, 3);
406                 if (err < 0) {
407                         if (err != PCRE_ERROR_NOMATCH)
408                                 warn(fmt, "PCRE matching error %d", err);
409                         // No further occurrence of the separator: the rest is a single field
410                         if (!fmt->sloppy || i < n) {
411                                 new_field(i);
412                                 in_field->len = n - i;
413                         }
414                         return 1;
415                 }
416                 if (!fmt->sloppy || ovec[0]) {
417                         new_field(i);
418                         in_field->len = ovec[0] - i;
419                 }
420                 i = ovec[1];
421         }
422 }
423
424 /*** Table back-end ***/
425
426 static void table_write(struct format *fmt)
427 {
428         for (int i = 0; i < intarray_count(&column_widths); i++) {
429                 if (fmt->table_grid) {
430                         putchar_unlocked('|');
431                         printf("%*s", fmt->table_sep / 2, "");
432                 } else if (i)
433                         printf("%*s", fmt->table_sep, "");
434
435                 int cw = *intarray_nth(&column_widths, i);
436                 int fw = 0;
437                 if (i < fields_count(&out_fields)) {
438                         int len;
439                         unsigned char *p = get_field(&out_fields, i, &len);
440                         fw = field_chars(fields_nth(&out_fields, i));
441                         if (fw > cw) {
442                                 warn(fmt, "Internal error: Wrongly calculated width of column %d (%d > %d)", i, fw, cw);
443                                 cw = fw;
444                         }
445                         while (len--)
446                                 putchar(*p++);
447                 }
448                 while (fw < cw) {
449                         putchar_unlocked(' ');
450                         fw++;
451                 }
452
453                 if (fmt->table_grid)
454                         printf("%*s", fmt->table_sep - fmt->table_sep / 2, "");
455         }
456
457         if (fmt->table_grid)
458                 putchar_unlocked('|');
459         putchar_unlocked('\n');
460 }
461
462 static void table_write_grid(struct format *fmt, int pos UNUSED)
463 {
464         if (!fmt->table_grid)
465                 return;
466
467         for (int i = 0; i < intarray_count(&column_widths); i++) {
468                 putchar_unlocked('+');
469                 int w = fmt->table_sep + *intarray_nth(&column_widths, i);
470                 while (w--)
471                         putchar('-');
472         }
473         putchar_unlocked('+');
474         putchar_unlocked('\n');
475 }
476
477 /*** Temporary file back-end ***/
478
479 static int tmp_read(struct format *fmt)
480 {
481         FILE *tf = fmt->tmp_file;
482
483         for (;;) {
484                 int c = getc_unlocked(tf);
485                 if (c < 0)
486                         return 0;
487                 if (c == 0xff)
488                         return 1;
489                 if (c == 0xfe) {
490                         c = getc_unlocked(tf);
491                         c = (c << 8) | getc_unlocked(tf);
492                         c = (c << 8) | getc_unlocked(tf);
493                         c = (c << 8) | getc_unlocked(tf);
494                 }
495                 new_field(line_count(&in_line));
496                 in_field->len = c;
497                 while (c--) {
498                         int x = getc_unlocked(tf);
499                         if (x < 0)
500                                 die("Truncated temporary file");
501                         *line_push(&in_line) = x;
502                 }
503         }
504
505         if (ferror_unlocked(tf))
506                 die("I/O error when reading temporary file");
507 }
508
509 static void tmp_write(struct format *fmt)
510 {
511         FILE *tf = fmt->tmp_file;
512
513         for (int i = 0; i < fields_count(&out_fields); i++) {
514                 int len;
515                 unsigned char *p = get_field(&out_fields, i, &len);
516
517                 if (len < 0xfe)
518                         putc_unlocked(len, tf);
519                 else {
520                         putc_unlocked(0xfe, tf);
521                         putc_unlocked((len >> 24) & 0xff, tf);
522                         putc_unlocked((len >> 16) & 0xff, tf);
523                         putc_unlocked((len >> 8) & 0xff, tf);
524                         putc_unlocked(len & 0xff, tf);
525                 }
526
527                 while (len--)
528                         putc_unlocked(*p++, tf);
529         }
530         putc_unlocked(0xff, tf);
531
532         if (ferror_unlocked(tf))
533                 die("I/O error when writing temporary file");
534 }
535
536 /*** Transforms ***/
537
538 static void trim_fields(void)
539 {
540         unsigned char *line = line_first(&in_line);
541         for (int i = 0; i < fields_count(&in_fields); i++) {
542                 struct field *f = fields_nth(&in_fields, i);
543                 while (f->len && is_ws(line[f->start_pos]))
544                         f->start_pos++, f->len--;
545                 while (f->len && is_ws(line[f->start_pos + f->len - 1]))
546                         f->len--;
547         }
548 }
549
550 static void equalize_fields(void)
551 {
552         while (fields_count(&out_fields) < intarray_count(&column_widths)) {
553                 struct field *f = fields_push(&out_fields);
554                 f->start_pos = f->len = 0;
555         }
556 }
557
558 /*** Field names and headers ***/
559
560 struct field_names {
561         stringarray_t names;
562 };
563
564 static void add_field(struct field_names *fn, char *name, int namelen)
565 {
566         char *n = xmalloc(namelen + 1);
567         memcpy(n, name, namelen);
568         n[namelen] = 0;
569         *stringarray_push(&fn->names) = n;
570 }
571
/*
 * Split a comma-separated list of names and append every item to the
 * list of field names. Empty items are added as empty names.
 */
static void add_field_names(struct field_names *fn, char *names)
{
	for (char *item = names; item; ) {
		char *comma = strchr(item, ',');
		if (comma) {
			add_field(fn, item, comma - item);
			item = comma + 1;
		} else {
			// The last item extends to the end of the string
			add_field(fn, item, (int) strlen(item));
			item = NULL;
		}
	}
}
582
583 static void read_header(void)
584 {
585         if (!(in_format->has_header || in_format->set_field_names))
586                 return;
587
588         struct field_names *fn = xmalloc_zero(sizeof(*fn));
589         in_format->field_names = fn;
590
591         if (in_format->has_header) {
592                 if (!read_line())
593                         die("Missing input header");
594         }
595
596         if (in_format->set_field_names) {
597                 add_field_names(fn, in_format->set_field_names);
598         } else {
599                 for (int i = 0; i < fields_count(&in_fields); i++) {
600                         int len;
601                         char *s = (char *) get_field(&in_fields, i, &len);
602                         add_field(fn, s, len);
603                 }
604         }
605 }
606
607 static void write_header(void)
608 {
609         if (!out_format->has_header) {
610                 write_grid(-1);
611                 return;
612         }
613
614         if (out_format->set_field_names) {
615                 struct field_names *fn = xmalloc_zero(sizeof(*fn));
616                 out_format->field_names = fn;
617                 add_field_names(fn, out_format->set_field_names);
618         } else if (in_format->field_names)
619                 out_format->field_names = in_format->field_names;
620         else
621                 die("Output header requested, but no field names specified");
622
623         line_reset(&in_line);
624         fields_reset(&out_fields);
625         struct field_names *fn = out_format->field_names;
626         for (int i = 0; i < stringarray_count(&fn->names); i++) {
627                 struct field *f = fields_push(&out_fields);
628                 f->start_pos = line_count(&in_line);
629                 f->len = 0;
630                 char *s = *stringarray_nth(&fn->names, i);
631                 while (*s) {
632                         *line_push(&in_line) = *s++;
633                         f->len++;
634                 }
635         }
636
637         // This is tricky: when we are formatting a table, field names are normally
638         // calculated in pass 1, but the header is written in pass 2, so we have to
639         // update column statistics, because field name can be too wide to fit.
640         want_stats++;
641         update_stats();
642         want_stats--;
643         if (want_equalize)
644                 equalize_fields();
645         write_grid(-1);
646         write_line();
647         write_grid(0);
648 }
649
/*
 * Write the end of the output, which is the grid line below the data
 * (if the output format has a grid writer).
 */
static void write_footer(void)
{
	write_grid(1);
}
654
655 static int find_field_by_name(struct field_names *fn, char *name)
656 {
657         for (int i = 0; i < stringarray_count(&fn->names); i++)
658                 if (!strcmp(*stringarray_nth(&fn->names, i), name))
659                         return i + 1;
660         return -1;
661 }
662
663 /*** Field selection ***/
664
665 struct selector {
666         int first_field, last_field;            // 0 means "boundary"
667 };
668
669 DECLARE_BUF(selectors, struct selector);
670 static selectors_t selectors;
671
/*
 * Parse a field number written as a string of decimal digits.
 *
 * Returns the number, or -1 if the string contains a character which is
 * not a digit or if the number has more than 9 significant digits.
 * An empty string is parsed as 0.
 */
static int parse_field_num(char *str)
{
	int value = 0;

	for (char *p = str; *p != '\0'; p++) {
		// The limit on the value keeps the multiplication below from overflowing
		if (*p < '0' || *p > '9' || value >= 100000000)
			return -1;
		value = value * 10 + (*p - '0');
	}
	return value;
}
686
687 static int parse_field(char *str)
688 {
689         if (!*str)
690                 return 0;
691
692         int f = parse_field_num(str);
693         if (f > 0)
694                 return f;
695
696         if (in_format->field_names && (f = find_field_by_name(in_format->field_names, str)) > 0)
697                 return f;
698
699         die("Unknown field %s", str);
700 }
701
702 static char *parse_selector(char *str)
703 {
704         char buf[strlen(str) + 1];
705         strcpy(buf, str);
706
707         struct selector *s = selectors_push(&selectors);
708         char *sep = strchr(buf, '-');
709         if (sep) {
710                 *sep++ = 0;
711                 s->first_field = parse_field(buf);
712                 s->last_field = parse_field(sep);
713         } else
714                 s->first_field = s->last_field = parse_field(buf);
715
716         return NULL;
717 }
718
719 static void finish_parse_selectors(void)
720 {
721         if (!selectors_count(&selectors))
722                 parse_selector("-");
723 }
724
725 static void select_fields(void)
726 {
727         for (int i = 0; i < selectors_count(&selectors); i++) {
728                 struct selector *s = selectors_nth(&selectors, i);
729                 int first = s->first_field;
730                 if (first <= 0)
731                         first = 1;
732                 int last = s->last_field;
733                 if (last <= 0)
734                         last = fields_count(&in_fields);
735                 for (int j = first; j <= last; j++) {
736                         struct field *f = fields_push(&out_fields);
737                         if (j >= 1 && j <= fields_count(&in_fields))
738                                 *f = *fields_nth(&in_fields, j-1);
739                         else
740                                 f->start_pos = f->len = 0;
741                 }
742         }
743 }
744
745 static void select_all_fields(void)
746 {
747         for (int i = 0; i < fields_count(&in_fields); i++)
748                 *fields_push(&out_fields) = *fields_nth(&in_fields, i);
749 }
750
751 /*** Processing of files ***/
752
753 static void one_pass(int pass)
754 {
755         if (pass & 2)
756                 write_header();
757
758         for (;;) {
759                 line_number++;
760                 if (!read_line())
761                         break;
762
763                 if (want_trim && (pass & 1))
764                         trim_fields();
765
766                 fields_reset(&out_fields);
767                 if (pass & 1)
768                         select_fields();
769                 else
770                         select_all_fields();
771
772                 if (want_equalize && (pass & 2))
773                         equalize_fields();
774                 update_stats();
775                 write_line();
776         }
777
778         if (pass & 2)
779                 write_footer();
780 }
781
782 static void two_pass(void)
783 {
784         struct format *final_format = out_format;
785
786         // We need to use character set info from the current locale
787         setlocale(LC_CTYPE, "");
788
789         // Pass 1: Set up writer of intermediate format
790         out_format = xmalloc_zero(sizeof(*out_format));
791         out_format->id = FORM_TMP;
792         out_format->read_line = tmp_read;
793         out_format->write_line = tmp_write;
794         out_format->tmp_file = tmpfile();
795         out_format->field_names = in_format->field_names;
796         one_pass(1);
797
798         // Pass 2: Set up reader of intermediate format
799         in_format = out_format;
800         rewind(in_format->tmp_file);
801         line_number = 0;
802         out_format = final_format;
803         want_stats = 0;
804         one_pass(2);
805         fclose(in_format->tmp_file);
806 }
807
808 /*** Parsing of arguments ***/
809
810 static void NONRET usage(void)
811 {
812         printf("\
813 Usage: xsv <in-format> [<out-format>] <options> [<fields>]\n\
814 \n\
815 Formats:\n\
816 -t, --tsv               TAB-separated values (default)\n\
817 -c, --csv               Comma-separated values\n\
818 -w, --ws                Values separated by arbitrary whitespace\n\
819 -r, --regex=<rx>        Separator given by Perl regular expression (input only)\n\
820     --table             Format a table (output only)\n\
821 \n\
822 Format parameters:\n\
823 -d, --fs=<char>         Delimiter of fields\n\
824 -f, --fields=<f>,...    Set field names\n\
825 -h, --header            The first line contains field names\n\
826 -q, --quiet             Do not show warnings\n\
827     --always-quote      Put quotes around all fields (CSV output only)\n\
828     --table-sep=<n>     Separate table columns by <n> spaces (default: 2)\n\
829     --grid              Separate table columns by grid lines\n\
830 -s, --sloppy            Ignore separators at the start/end of line (ws/regex only)\n\
831 \n\
832 Other options:\n\
833     --trim              Trim leading and trailing whitespaces in fields\n\
834     --equalize          Pad all lines to the maximum number of fields\n\
835 ");
836         exit(0);
837 }
838
839 static void NONRET bad_args(const char *msg, ...)
840 {
841         if (msg) {
842                 va_list args;
843                 va_start(args, msg);
844                 fprintf(stderr, "xsv: ");
845                 vfprintf(stderr, msg, args);
846                 fputc('\n', stderr);
847                 va_end(args);
848         }
849         fprintf(stderr, "Try `xsv --help' for more information.\n");
850         exit(1);
851 }
852
// Short options for getopt_long(); a colon marks options taking an argument.
// 's' is listed here, because the help text documents -s as the short form
// of --sloppy and without it getopt_long() would reject -s as unknown.
static const char short_options[] = "cd:f:hqr:stwW";
854
/*
 * Codes of options which have only a long form. They start at 256,
 * so that they cannot collide with characters used for short options.
 */
enum long_options {
	OPT_HELP = 256,		// --help
	OPT_TRIM,		// --trim
	OPT_ALWAYS_QUOTE,	// --always-quote
	OPT_TABLE,		// --table
	OPT_TABLE_SEP,		// --table-sep
	OPT_GRID,		// --grid
	OPT_EQUALIZE,		// --equalize
};
864
865 static const struct option long_options[] = {
866         { "always-quote",       0,      NULL,   OPT_ALWAYS_QUOTE },
867         { "csv",                0,      NULL,   'c' },
868         { "equalize",           0,      NULL,   OPT_EQUALIZE },
869         { "fields",             1,      NULL,   'f' },
870         { "fs",                 1,      NULL,   'd' },
871         { "grid",               0,      NULL,   OPT_GRID },
872         { "header",             0,      NULL,   'h' },
873         { "quiet",              0,      NULL,   'q' },
874         { "regex",              1,      NULL,   'r' },
875         { "sloppy",             0,      NULL,   's' },
876         { "table",              0,      NULL,   OPT_TABLE },
877         { "table-sep",          1,      NULL,   OPT_TABLE_SEP },
878         { "trim",               0,      NULL,   OPT_TRIM },
879         { "tsv",                0,      NULL,   't' },
880         { "ws",                 0,      NULL,   'w' },
881         { "help",               0,      NULL,   OPT_HELP },
882         { NULL,                 0,      NULL,   0 },
883 };
884
885 static void set_format(int format_id)
886 {
887         struct format *f = xmalloc_zero(sizeof(*f));
888         f->id = format_id;
889
890         switch (format_id) {
891                 case FORM_TSV:
892                         f->fs = '\t';
893                         f->quote = -1;
894                         f->read_line = csv_read;
895                         f->write_line = csv_write;
896                         break;
897                 case FORM_CSV:
898                         f->fs = ',';
899                         f->quote = '"';
900                         f->read_line = csv_read;
901                         f->write_line = csv_write;
902                         break;
903                 case FORM_WS:
904                         f->fs = ' ';
905                         f->quote = -1;
906                         f->read_line = ws_read;
907                         f->write_line = csv_write;
908                         break;
909                 case FORM_REGEX:
910                         f->read_line = regex_read;
911                         break;
912                 case FORM_TABLE:
913                         f->write_line = table_write;
914                         f->write_grid = table_write_grid;
915                         f->needs_stats = 1;
916                         f->table_sep = 2;
917                         break;
918         }
919
920         if (!in_format)
921                 in_format = f;
922         else if (!out_format)
923                 out_format = f;
924         else
925                 bad_args("At most two formats may be given.");
926 }
927
928 static struct format *current_format(void)
929 {
930         if (out_format)
931                 return out_format;
932         if (in_format)
933                 return in_format;
934         set_format(FORM_TSV);
935         return in_format;
936 }
937
938 int main(int argc, char **argv)
939 {
940         int opt;
941         const char *err;
942
943         while ((opt = getopt_long(argc, argv, short_options, long_options, NULL)) >= 0)
944                 switch (opt) {
945                         case 'c':
946                                 set_format(FORM_CSV);
947                                 break;
948                         case 'd':
949                                 if (optarg[0])
950                                         current_format()->fs = optarg[0];
951                                 else
952                                         bad_args("No field delimiter given.");
953                                 break;
954                         case 'f':
955                                 current_format()->set_field_names = optarg;
956                                 break;
957                         case 'h':
958                                 current_format()->has_header = 1;
959                                 break;
960                         case 'q':
961                                 current_format()->quiet = 1;
962                                 break;
963                         case 'r':
964                                 set_format(FORM_REGEX);
965                                 err = regex_set(current_format(), optarg);
966                                 if (err)
967                                         bad_args("Error compiling regex: %s", err);
968                                 break;
969                         case 's':
970                                 if (current_format()->id != FORM_WS && current_format()->id != FORM_REGEX)
971                                         bad_args("--sloppy makes sense only for --ws or --regex.");
972                                 current_format()->sloppy = 1;
973                                 break;
974                         case 't':
975                                 set_format(FORM_TSV);
976                                 break;
977                         case 'w':
978                                 set_format(FORM_WS);
979                                 break;
980                         case OPT_ALWAYS_QUOTE:
981                                 if (current_format()->id != FORM_CSV)
982                                         bad_args("--always-quote makes sense only for --csv.");
983                                 current_format()->always_quote = 1;
984                                 break;
985                         case OPT_HELP:
986                                 usage();
987                         case OPT_TRIM:
988                                 want_trim = 1;
989                                 break;
990                         case OPT_TABLE:
991                                 set_format(FORM_TABLE);
992                                 break;
993                         case OPT_TABLE_SEP:
994                                 current_format()->table_sep = atoi(optarg);
995                                 break;
996                         case OPT_GRID:
997                                 current_format()->table_grid = 1;
998                                 break;
999                         case OPT_EQUALIZE:
1000                                 want_equalize = 1;
1001                                 break;
1002                         default:
1003                                 bad_args(NULL);
1004                 }
1005
1006         current_format();
1007         if (!out_format)
1008                 out_format = in_format;
1009         if (!in_format->read_line)
1010                 bad_args("Write-only format selected for input.");
1011         if (!out_format->write_line)
1012                 bad_args("Read-only format selected for output.");
1013         read_header();
1014
1015         for (int i = optind; i < argc; i++) {
1016                 err = parse_selector(argv[i]);
1017                 if (err)
1018                         bad_args(err);
1019         }
1020         finish_parse_selectors();
1021
1022         want_stats = out_format->needs_stats | want_equalize;
1023         if (want_stats)
1024                 two_pass();
1025         else
1026                 one_pass(3);
1027         return 0;
1028 }