]> mj.ucw.cz Git - xsv.git/blob - xsv.c
make release: Reorganized my directory structure
[xsv.git] / xsv.c
1 /*
2  *      The Swiss-Army Knife for CSV-like Files
3  *
4  *      (c) 2012 Martin Mares <mj@ucw.cz>
5  */
6
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <string.h>
10 #include <stdarg.h>
11 #include <getopt.h>
12 #include <wchar.h>
13 #include <locale.h>
14
15 #include <pcre.h>
16
17 #ifdef __GNUC__
18 #define NONRET __attribute__((noreturn))
19 #define UNUSED __attribute__((unused))
20 #else
21 #define NONRET
22 #define UNUSED
23 #endif
24
25 static void select_fields(void);
26 static void select_all_fields(void);
27
28 /*** General functions ***/
29
30 static void NONRET die(char *msg, ...)
31 {
32         va_list args;
33         va_start(args, msg);
34         fprintf(stderr, "xsv: ");
35         vfprintf(stderr, msg, args);
36         fputc('\n', stderr);
37         va_end(args);
38         exit(1);
39 }
40
41 /*** Memory allocation ***/
42
/*
 * malloc() wrapper that never returns NULL: when the allocation fails,
 * the program is terminated through die().
 */
static void *xmalloc(size_t bytes)
{
        void *mem = malloc(bytes);

        if (mem == NULL)
                die("Out of memory (cannot allocate %zu bytes)", bytes);
        return mem;
}
50
/*
 * Allocate `bytes` bytes with xmalloc() and clear them to zero.
 */
static void *xmalloc_zero(size_t bytes)
{
        return memset(xmalloc(bytes), 0, bytes);
}
57
/*
 * realloc() wrapper: resizes `old` to `bytes` bytes and returns the new
 * pointer. A NULL result from realloc() terminates the program via die().
 */
static void *xrealloc(void *old, size_t bytes)
{
        void *resized = realloc(old, bytes);

        if (resized == NULL)
                die("Out of memory (cannot allocate %zu bytes)", bytes);
        return resized;
}
65
/*
 * DECLARE_BUF(name, type) generates a growable array of `type`:
 *
 *   name_t                  the array: start pointer, used count, capacity
 *   name_init(b)            make the array empty with no storage
 *   name_reset(b)           forget all elements, keep the storage
 *   name_count(b)           number of elements currently stored
 *   name_extend(b)          grow the capacity (16 first, then doubling)
 *   name_push(b)            append one uninitialized element, return its address
 *   name_first(b)           address of element 0 (NULL if nothing was allocated)
 *   name_nth(b, n)          address of element n (no bounds checking)
 *
 * Pointers returned by push/first/nth are invalidated by a later push
 * that has to grow the array.
 */
#define DECLARE_BUF(name, type) \
        typedef struct { type *start; int count; int max; } name##_t; \
        static inline void name##_init(name##_t *buf) \
        { \
                buf->start = NULL; \
                buf->max = 0; \
                buf->count = 0; \
        } \
        static inline void name##_reset(name##_t *buf) \
        { \
                buf->count = 0; \
        } \
        static inline int name##_count(name##_t *buf) \
        { \
                return buf->count; \
        } \
        static void name##_extend(name##_t *buf) \
        { \
                if (buf->max) \
                        buf->max *= 2; \
                else \
                        buf->max = 16; \
                buf->start = xrealloc(buf->start, buf->max * sizeof(type)); \
        } \
        static inline type *name##_push(name##_t *buf) \
        { \
                if (buf->count >= buf->max) \
                        name##_extend(buf); \
                return &buf->start[buf->count++]; \
        } \
        static inline type *name##_first(name##_t *buf) \
        { \
                return buf->start; \
        } \
        static inline type *name##_nth(name##_t *buf, int n) \
        { \
                return &buf->start[n]; \
        }

// Growable arrays of ints and of C strings
DECLARE_BUF(intarray, int);
DECLARE_BUF(stringarray, char *);
85
86 /*** Formats and their parameters ***/
87
/* Identifiers of the supported input/output back-ends. */
enum format_id {
        FORM_UNSPEC = 0,
        FORM_TSV = 1,
        FORM_CSV = 2,
        FORM_WS = 3,
        FORM_REGEX = 4,
        FORM_TMP = 5,
        FORM_TABLE = 6,
};
97
98 struct format {
99         enum format_id id;
100         int fs;
101         int quote;
102         int quiet;
103         int sloppy;
104         int (*read_line)(struct format *fmt);
105         void (*write_line)(struct format *fmt);
106         void (*write_grid)(struct format *fmt, int pos);        // -1=above, 1=below, 0=after header
107         int needs_stats;
108
109         // Field names
110         int has_header;
111         char *set_field_names;
112         struct field_names *field_names;
113
114         // CSV backend:
115         int always_quote;
116
117         // regex backend:
118         pcre *pcre;
119         pcre_extra *pcre_extra;
120
121         // Temporary file backend:
122         FILE *tmp_file;
123
124         // Table backend:
125         int table_sep;
126         int table_grid;
127 };
128
// Formats currently used for reading and for writing
static struct format *in_format;
static struct format *out_format;

// Requested transforms / bookkeeping
static int want_trim;
static int want_equalize;
static int want_stats;
131
/* One field of a line: a slice (offset and byte length) of in_line. */
struct field {
        int start_pos, len;
};
136
137 DECLARE_BUF(fields, struct field);
138 DECLARE_BUF(line, unsigned char);
139
140 static fields_t in_fields, out_fields;
141 static struct field *in_field;
142 static line_t in_line;
143 static int line_number;
144
145 static int read_line(void)
146 {
147         fields_reset(&in_fields);
148         line_reset(&in_line);
149         in_field = NULL;
150         if (!in_format->read_line(in_format))
151                 return 0;
152         if (ferror_unlocked(stdin))
153                 die("I/O error when reading standard input");
154         return 1;
155 }
156
157 static void write_line(void)
158 {
159         out_format->write_line(out_format);
160         if (ferror_unlocked(stdout))
161                 die("I/O error when writing standard input");
162 }
163
164 static void write_grid(int pos)
165 {
166         if (out_format->write_grid) {
167                 out_format->write_grid(out_format, pos);
168                 if (ferror_unlocked(stdout))
169                         die("I/O error when writing standard input");
170         }
171 }
172
173 static void new_field(int pos)
174 {
175         in_field = fields_push(&in_fields);
176         in_field->start_pos = pos;
177         in_field->len = 0;
178 }
179
180 static void ensure_field(int pos)
181 {
182         if (!in_field)
183                 new_field(pos);
184 }
185
186 static unsigned char *get_field(fields_t *fields, int i, int *len)
187 {
188         struct field *f = fields_nth(fields, i);
189         *len = f->len;
190         return line_nth(&in_line, f->start_pos);
191 }
192
193 static void warn(struct format *fmt, char *msg, ...)
194 {
195         if (!fmt->quiet) {
196                 fprintf(stderr, "Warning at line %d: ", line_number);
197                 va_list args;
198                 va_start(args, msg);
199                 vfprintf(stderr, msg, args);
200                 va_end(args);
201                 fputc('\n', stderr);
202         }
203 }
204
205 static int next_line(void)
206 {
207         for (;;) {
208                 int c = getchar_unlocked();
209                 if (c == '\r')
210                         continue;
211                 if (c < 0)
212                         return !!line_count(&in_line);
213                 if (c == '\n')
214                         return 1;
215                 *line_push(&in_line) = c;
216         }
217 }
218
219 static int field_chars(struct field *f)
220 {
221         unsigned char *s = line_nth(&in_line, f->start_pos);
222         int i = 0;
223         mbstate_t mbs;
224         memset(&mbs, 0, sizeof(mbs));
225
226         int chars = 0;
227         while (i < f->len) {
228                 size_t k = mbrlen((char *) s + i, f->len - i, &mbs);
229                 if ((int) k <= 0)
230                         break;
231                 i += k;
232                 chars++;
233         }
234
235         return chars;
236 }
237
238 /*** Field statistics ***/
239
240 static intarray_t column_widths;
241
242 static void update_stats(void)
243 {
244         if (!want_stats)
245                 return;
246
247         for (int i = 0; i < fields_count(&out_fields); i++) {
248                 struct field *f = fields_nth(&out_fields, i);
249                 intarray_t *w = &column_widths;
250
251                 while (i >= intarray_count(w))
252                         *intarray_push(w) = 0;
253                 int fw = field_chars(f);
254                 if (*intarray_nth(w, i) < fw)
255                         *intarray_nth(w, i) = fw;
256         }
257 }
258
259 /*** CSV/TSV back-end */
260
261 static int csv_read(struct format *fmt)
262 {
263         int quoted = 0;
264         for (;;) {
265                 int c = getchar_unlocked();
266                 int i = line_count(&in_line);
267 restart:
268                 if (c == '\r')
269                         continue;
270                 if (c < 0 || c == '\n') {
271                         if (quoted)
272                                 warn(fmt, "Missing closing quote.");
273                         if (c < 0)
274                                 return !!fields_count(&in_fields);
275                         else
276                                 return 1;
277                 }
278                 if (quoted) {
279                         if (c == fmt->quote) {
280                                 c = getchar_unlocked();
281                                 if (c != fmt->quote) {
282                                         quoted = 0;
283                                         goto restart;
284                                 }
285                                 // Two quotes assimilate to one
286                         }
287                         // Fall through to pushing the character
288                 } else if (c == fmt->quote) {
289                         quoted = 1;
290                         continue;
291                 } else if (c == fmt->fs && !quoted) {
292                         ensure_field(i);
293                         new_field(i);
294                         continue;
295                 }
296                 ensure_field(i);
297                 *line_push(&in_line) = c;
298                 in_field->len++;
299         }
300 }
301
/* Return 1 if c is a space, a tab or a form feed, 0 otherwise. */
static int is_ws(int c)
{
        switch (c) {
        case ' ':
        case '\t':
        case '\f':
                return 1;
        default:
                return 0;
        }
}
306
307 static void csv_write(struct format *fmt)
308 {
309         for (int i=0; i < fields_count(&out_fields); i++) {
310                 int len;
311                 unsigned char *p = get_field(&out_fields, i, &len);
312
313                 int need_quotes = 0;
314                 if (fmt->quote >= 0) {
315                         need_quotes = fmt->always_quote;
316                         for (int j=0; !need_quotes && j < len; j++) {
317                                 if (p[j] == fmt->fs || p[j] == fmt->quote)
318                                         need_quotes = 1;
319                         }
320                 }
321                 if (i)
322                         putchar_unlocked(fmt->fs);
323                 if (need_quotes)
324                         putchar_unlocked(fmt->quote);
325                 for (int j=0; j < len; j++) {
326                         int c = p[j];
327                         if (c == fmt->fs && !need_quotes)
328                                 warn(fmt, "Field separator found inside field and quoting is turned off.");
329                         if (c == fmt->quote)
330                                 putchar_unlocked(c);
331                         putchar_unlocked(c);
332                 }
333                 if (need_quotes)
334                         putchar_unlocked(fmt->quote);
335         }
336         putchar_unlocked('\n');
337 }
338
339 /*** White-space back-end ***/
340
341 static int ws_read(struct format *fmt)
342 {
343         if (!next_line())
344                 return 0;
345
346         unsigned char *line = line_first(&in_line);
347         int n = line_count(&in_line);
348         if (!n)
349                 return 1;
350
351         int ws = 0;
352         new_field(0);
353         for (int i=0; i<n; i++) {
354                 int c = line[i];
355                 if (is_ws(c)) {
356                         ws++;
357                 } else {
358                         if (ws) {
359                                 if (!in_field->start_pos &&
360                                     !in_field->len &&
361                                     fmt->sloppy)
362                                         in_field->start_pos = i;
363                                 else
364                                         new_field(i);
365                                 ws = 0;
366                         }
367                         in_field->len++;
368                 }
369         }
370
371         if (ws && !fmt->sloppy)
372                 new_field(n);
373         return 1;
374 }
375
376 /*** Regex back-end ***/
377
378 static const char *regex_set(struct format *f, char *rx)
379 {
380         const char *err;
381         int errpos;
382         f->pcre = pcre_compile(rx, PCRE_DOLLAR_ENDONLY, &err, &errpos, NULL);
383         if (!f->pcre)
384                 return err;
385
386         f->pcre_extra = pcre_study(f->pcre, 0, &err);
387         if (!f->pcre_extra)
388                 return err;
389
390         return NULL;
391 }
392
393 static int regex_read(struct format *fmt)
394 {
395         if (!next_line())
396                 return 0;
397
398         unsigned char *c = line_first(&in_line);
399         int n = line_count(&in_line);
400         if (!n)
401                 return 1;
402
403         int i = 0;
404         for (;;) {
405                 int ovec[3];
406                 int err = pcre_exec(fmt->pcre, fmt->pcre_extra, (char *) c, n, i, 0, ovec, 3);
407                 if (err < 0) {
408                         if (err != PCRE_ERROR_NOMATCH)
409                                 warn(fmt, "PCRE matching error %d", err);
410                         // No further occurrence of the separator: the rest is a single field
411                         if (!fmt->sloppy || i < n) {
412                                 new_field(i);
413                                 in_field->len = n - i;
414                         }
415                         return 1;
416                 }
417                 if (ovec[0] == ovec[1]) {
418                         warn(fmt, "Regular expression matched an empty separator.");
419                         new_field(i);
420                         in_field->len = n - i;
421                         return 1;
422                 }
423                 if (!fmt->sloppy || ovec[0]) {
424                         new_field(i);
425                         in_field->len = ovec[0] - i;
426                 }
427                 i = ovec[1];
428         }
429 }
430
431 /*** Table back-end ***/
432
433 static void table_write(struct format *fmt)
434 {
435         for (int i = 0; i < intarray_count(&column_widths); i++) {
436                 if (fmt->table_grid) {
437                         putchar_unlocked('|');
438                         printf("%*s", fmt->table_sep / 2, "");
439                 } else if (i)
440                         printf("%*s", fmt->table_sep, "");
441
442                 int cw = *intarray_nth(&column_widths, i);
443                 int fw = 0;
444                 if (i < fields_count(&out_fields)) {
445                         int len;
446                         unsigned char *p = get_field(&out_fields, i, &len);
447                         fw = field_chars(fields_nth(&out_fields, i));
448                         if (fw > cw) {
449                                 warn(fmt, "Internal error: Wrongly calculated width of column %d (%d > %d)", i, fw, cw);
450                                 cw = fw;
451                         }
452                         while (len--)
453                                 putchar_unlocked(*p++);
454                 }
455                 while (fw < cw) {
456                         putchar_unlocked(' ');
457                         fw++;
458                 }
459
460                 if (fmt->table_grid)
461                         printf("%*s", fmt->table_sep - fmt->table_sep / 2, "");
462         }
463
464         if (fmt->table_grid)
465                 putchar_unlocked('|');
466         putchar_unlocked('\n');
467 }
468
469 static void table_write_grid(struct format *fmt, int pos UNUSED)
470 {
471         if (!fmt->table_grid)
472                 return;
473
474         for (int i = 0; i < intarray_count(&column_widths); i++) {
475                 putchar_unlocked('+');
476                 int w = fmt->table_sep + *intarray_nth(&column_widths, i);
477                 while (w--)
478                         putchar_unlocked('-');
479         }
480         putchar_unlocked('+');
481         putchar_unlocked('\n');
482 }
483
484 /*** Temporary file back-end ***/
485
486 static int tmp_read(struct format *fmt)
487 {
488         FILE *tf = fmt->tmp_file;
489
490         for (;;) {
491                 int c = getc_unlocked(tf);
492                 if (c < 0)
493                         return 0;
494                 if (c == 0xff)
495                         return 1;
496                 if (c == 0xfe) {
497                         c = getc_unlocked(tf);
498                         c = (c << 8) | getc_unlocked(tf);
499                         c = (c << 8) | getc_unlocked(tf);
500                         c = (c << 8) | getc_unlocked(tf);
501                 }
502                 new_field(line_count(&in_line));
503                 in_field->len = c;
504                 while (c--) {
505                         int x = getc_unlocked(tf);
506                         if (x < 0)
507                                 die("Truncated temporary file");
508                         *line_push(&in_line) = x;
509                 }
510         }
511
512         if (ferror_unlocked(tf))
513                 die("I/O error when reading temporary file");
514 }
515
516 static void tmp_write(struct format *fmt)
517 {
518         FILE *tf = fmt->tmp_file;
519
520         for (int i = 0; i < fields_count(&out_fields); i++) {
521                 int len;
522                 unsigned char *p = get_field(&out_fields, i, &len);
523
524                 if (len < 0xfe)
525                         putc_unlocked(len, tf);
526                 else {
527                         putc_unlocked(0xfe, tf);
528                         putc_unlocked((len >> 24) & 0xff, tf);
529                         putc_unlocked((len >> 16) & 0xff, tf);
530                         putc_unlocked((len >> 8) & 0xff, tf);
531                         putc_unlocked(len & 0xff, tf);
532                 }
533
534                 while (len--)
535                         putc_unlocked(*p++, tf);
536         }
537         putc_unlocked(0xff, tf);
538
539         if (ferror_unlocked(tf))
540                 die("I/O error when writing temporary file");
541 }
542
543 /*** Transforms ***/
544
545 static void trim_fields(void)
546 {
547         unsigned char *line = line_first(&in_line);
548         for (int i = 0; i < fields_count(&in_fields); i++) {
549                 struct field *f = fields_nth(&in_fields, i);
550                 while (f->len && is_ws(line[f->start_pos]))
551                         f->start_pos++, f->len--;
552                 while (f->len && is_ws(line[f->start_pos + f->len - 1]))
553                         f->len--;
554         }
555 }
556
557 static void equalize_fields(void)
558 {
559         while (fields_count(&out_fields) < intarray_count(&column_widths)) {
560                 struct field *f = fields_push(&out_fields);
561                 f->start_pos = f->len = 0;
562         }
563 }
564
565 /*** Field names and headers ***/
566
/* List of field names; entry i holds the name of field number i+1. */
struct field_names {
        stringarray_t names;    // Heap-allocated, NUL-terminated copies (see add_field())
};
570
571 static void add_field(struct field_names *fn, char *name, int namelen)
572 {
573         char *n = xmalloc(namelen + 1);
574         memcpy(n, name, namelen);
575         n[namelen] = 0;
576         *stringarray_push(&fn->names) = n;
577 }
578
/*
 * Split a comma-separated list of names and append every item to fn.
 * Empty items (including an entirely empty list) become empty names.
 */
static void add_field_names(struct field_names *fn, char *names)
{
        for (char *item = names; item; ) {
                char *comma = strchr(item, ',');

                if (comma) {
                        add_field(fn, item, comma - item);
                        item = comma + 1;
                } else {
                        add_field(fn, item, (int) strlen(item));
                        item = NULL;
                }
        }
}
589
590 static void read_header(void)
591 {
592         if (!(in_format->has_header || in_format->set_field_names))
593                 return;
594
595         struct field_names *fn = xmalloc_zero(sizeof(*fn));
596         in_format->field_names = fn;
597
598         if (in_format->has_header) {
599                 if (!read_line())
600                         die("Missing input header");
601         }
602
603         if (in_format->set_field_names) {
604                 add_field_names(fn, in_format->set_field_names);
605         } else {
606                 for (int i = 0; i < fields_count(&in_fields); i++) {
607                         int len;
608                         char *s = (char *) get_field(&in_fields, i, &len);
609                         add_field(fn, s, len);
610                 }
611         }
612 }
613
614 static void write_header(void)
615 {
616         if (!out_format->has_header) {
617                 write_grid(-1);
618                 return;
619         }
620
621         int want_select_fields = 0;
622         if (out_format->set_field_names) {
623                 struct field_names *fn = xmalloc_zero(sizeof(*fn));
624                 out_format->field_names = fn;
625                 add_field_names(fn, out_format->set_field_names);
626         } else if (in_format->field_names) {
627                 out_format->field_names = in_format->field_names;
628                 want_select_fields = 1;
629         } else
630                 die("Output header requested, but no field names specified");
631
632         line_reset(&in_line);
633         fields_reset(&in_fields);
634         struct field_names *fn = out_format->field_names;
635         for (int i = 0; i < stringarray_count(&fn->names); i++) {
636                 struct field *f = fields_push(&in_fields);
637                 f->start_pos = line_count(&in_line);
638                 f->len = 0;
639                 char *s = *stringarray_nth(&fn->names, i);
640                 while (*s) {
641                         *line_push(&in_line) = *s++;
642                         f->len++;
643                 }
644         }
645
646         fields_reset(&out_fields);
647         if (want_select_fields)
648                 select_fields();
649         else
650                 select_all_fields();
651
652         // This is tricky: when we are formatting a table, field names are normally
653         // calculated in pass 1, but the header is written in pass 2, so we have to
654         // update column statistics, because field name can be too wide to fit.
655         want_stats++;
656         update_stats();
657         want_stats--;
658         if (want_equalize)
659                 equalize_fields();
660         write_grid(-1);
661         write_line();
662         write_grid(0);
663 }
664
/* Finish the output: draw the grid line below the table, if any. */
static void write_footer(void)
{
        const int below = 1;

        write_grid(below);
}
669
670 static int find_field_by_name(struct field_names *fn, char *name)
671 {
672         for (int i = 0; i < stringarray_count(&fn->names); i++)
673                 if (!strcmp(*stringarray_nth(&fn->names, i), name))
674                         return i + 1;
675         return -1;
676 }
677
678 /*** Field selection ***/
679
680 struct selector {
681         int first_field, last_field;            // 0 means "boundary"
682 };
683
684 DECLARE_BUF(selectors, struct selector);
685 static selectors_t selectors;
686
/*
 * Parse a string consisting solely of decimal digits.
 *
 * Returns the value (0 for an empty string), or -1 when the string
 * contains a non-digit or the number would need more digits than the
 * 100000000 limit permits.
 */
static int parse_field_num(char *str)
{
        int value = 0;

        for (const char *p = str; *p; p++) {
                int is_digit = (*p >= '0' && *p <= '9');
                if (!is_digit || value >= 100000000)
                        return -1;
                value = value * 10 + (*p - '0');
        }
        return value;
}
701
702 static int parse_field(char *str)
703 {
704         if (!*str)
705                 return 0;
706
707         int f = parse_field_num(str);
708         if (f > 0)
709                 return f;
710
711         if (in_format->field_names && (f = find_field_by_name(in_format->field_names, str)) > 0)
712                 return f;
713
714         die("Unknown field `%s'", str);
715 }
716
717 static char *parse_selector(char *str)
718 {
719         char buf[strlen(str) + 1];
720         strcpy(buf, str);
721
722         struct selector *s = selectors_push(&selectors);
723         char *sep = strchr(buf, '-');
724         if (sep) {
725                 *sep++ = 0;
726                 s->first_field = parse_field(buf);
727                 s->last_field = parse_field(sep);
728         } else
729                 s->first_field = s->last_field = parse_field(buf);
730
731         return NULL;
732 }
733
734 static void finish_parse_selectors(void)
735 {
736         if (!selectors_count(&selectors))
737                 parse_selector("-");
738 }
739
740 static void select_fields(void)
741 {
742         for (int i = 0; i < selectors_count(&selectors); i++) {
743                 struct selector *s = selectors_nth(&selectors, i);
744                 int first = s->first_field;
745                 if (first <= 0)
746                         first = 1;
747                 int last = s->last_field;
748                 if (last <= 0)
749                         last = fields_count(&in_fields);
750                 for (int j = first; j <= last; j++) {
751                         struct field *f = fields_push(&out_fields);
752                         if (j >= 1 && j <= fields_count(&in_fields))
753                                 *f = *fields_nth(&in_fields, j-1);
754                         else
755                                 f->start_pos = f->len = 0;
756                 }
757         }
758 }
759
760 static void select_all_fields(void)
761 {
762         for (int i = 0; i < fields_count(&in_fields); i++)
763                 *fields_push(&out_fields) = *fields_nth(&in_fields, i);
764 }
765
766 /*** Processing of files ***/
767
768 static void one_pass(int pass)
769 {
770         if (pass & 2)
771                 write_header();
772
773         for (;;) {
774                 line_number++;
775                 if (!read_line())
776                         break;
777
778                 if (want_trim && (pass & 1))
779                         trim_fields();
780
781                 fields_reset(&out_fields);
782                 if (pass & 1)
783                         select_fields();
784                 else
785                         select_all_fields();
786
787                 if (want_equalize && (pass & 2))
788                         equalize_fields();
789                 update_stats();
790                 write_line();
791         }
792
793         if (pass & 2)
794                 write_footer();
795 }
796
797 static void two_pass(void)
798 {
799         struct format *final_format = out_format;
800
801         // We need to use character set info from the current locale
802         setlocale(LC_CTYPE, "");
803
804         // Pass 1: Set up writer of intermediate format
805         out_format = xmalloc_zero(sizeof(*out_format));
806         out_format->id = FORM_TMP;
807         out_format->read_line = tmp_read;
808         out_format->write_line = tmp_write;
809         out_format->tmp_file = tmpfile();
810         out_format->field_names = in_format->field_names;
811         one_pass(1);
812
813         // Pass 2: Set up reader of intermediate format
814         in_format = out_format;
815         rewind(in_format->tmp_file);
816         line_number = 0;
817         out_format = final_format;
818         want_stats = 0;
819         one_pass(2);
820         fclose(in_format->tmp_file);
821 }
822
823 /*** Parsing of arguments ***/
824
825 static void NONRET usage(void)
826 {
827         printf("\
828 Usage: xsv <in-format> [<out-format>] <options> [<fields>]\n\
829 \n\
830 Formats:\n\
831 -t, --tsv               Tab-separated values (default)\n\
832 -c, --csv               Comma-separated values\n\
833 -w, --ws                Values separated by arbitrary whitespace\n\
834 -r, --regex=<rx>        Separator given by Perl regular expression (input only)\n\
835     --table             Format a table (output only)\n\
836 \n\
837 Format parameters:\n\
838 -d, --fs=<char>         Delimiter of fields\n\
839 -f, --fields=<f>,...    Set field names\n\
840 -h, --header            The first line contains field names\n\
841 -q, --quiet             Do not show warnings\n\
842     --always-quote      Put quotes around all fields (CSV output only)\n\
843     --table-sep=<n>     Separate table columns by <n> spaces (default: 2)\n\
844     --grid              Separate table columns by grid lines\n\
845 -s, --sloppy            Ignore separators at the start/end of line (ws/regex only)\n\
846 \n\
847 Other options:\n\
848     --trim              Trim leading and trailing whitespaces in fields\n\
849     --equalize          Pad all lines to the maximum number of fields\n\
850 ");
851         exit(0);
852 }
853
854 static void NONRET bad_args(const char *msg, ...)
855 {
856         if (msg) {
857                 va_list args;
858                 va_start(args, msg);
859                 fprintf(stderr, "xsv: ");
860                 vfprintf(stderr, msg, args);
861                 fputc('\n', stderr);
862                 va_end(args);
863         }
864         fprintf(stderr, "Try `xsv --help' for more information.\n");
865         exit(1);
866 }
867
// Short options for getopt_long(). 's' is the short form of --sloppy, as
// documented in the usage text and used as its value in long_options.
static const char short_options[] = "cd:f:hqr:stwW";
869
/*
 * Values returned by getopt_long() for options that have no short
 * form; they start at 256 to stay clear of character codes.
 */
enum long_options {
        OPT_HELP = 256,
        OPT_VERSION = 257,
        OPT_TRIM = 258,
        OPT_ALWAYS_QUOTE = 259,
        OPT_TABLE = 260,
        OPT_TABLE_SEP = 261,
        OPT_GRID = 262,
        OPT_EQUALIZE = 263,
};
880
881 static const struct option long_options[] = {
882         { "always-quote",       0,      NULL,   OPT_ALWAYS_QUOTE },
883         { "csv",                0,      NULL,   'c' },
884         { "equalize",           0,      NULL,   OPT_EQUALIZE },
885         { "fields",             1,      NULL,   'f' },
886         { "fs",                 1,      NULL,   'd' },
887         { "grid",               0,      NULL,   OPT_GRID },
888         { "header",             0,      NULL,   'h' },
889         { "help",               0,      NULL,   OPT_HELP },
890         { "quiet",              0,      NULL,   'q' },
891         { "regex",              1,      NULL,   'r' },
892         { "sloppy",             0,      NULL,   's' },
893         { "table",              0,      NULL,   OPT_TABLE },
894         { "table-sep",          1,      NULL,   OPT_TABLE_SEP },
895         { "trim",               0,      NULL,   OPT_TRIM },
896         { "tsv",                0,      NULL,   't' },
897         { "version",            0,      NULL,   OPT_VERSION },
898         { "ws",                 0,      NULL,   'w' },
899         { NULL,                 0,      NULL,   0 },
900 };
901
902 static void set_format(int format_id)
903 {
904         struct format *f = xmalloc_zero(sizeof(*f));
905         f->id = format_id;
906
907         switch (format_id) {
908                 case FORM_TSV:
909                         f->fs = '\t';
910                         f->quote = -1;
911                         f->read_line = csv_read;
912                         f->write_line = csv_write;
913                         break;
914                 case FORM_CSV:
915                         f->fs = ',';
916                         f->quote = '"';
917                         f->read_line = csv_read;
918                         f->write_line = csv_write;
919                         break;
920                 case FORM_WS:
921                         f->fs = ' ';
922                         f->quote = -1;
923                         f->read_line = ws_read;
924                         f->write_line = csv_write;
925                         break;
926                 case FORM_REGEX:
927                         f->read_line = regex_read;
928                         break;
929                 case FORM_TABLE:
930                         f->write_line = table_write;
931                         f->write_grid = table_write_grid;
932                         f->needs_stats = 1;
933                         f->table_sep = 2;
934                         break;
935         }
936
937         if (!in_format)
938                 in_format = f;
939         else if (!out_format)
940                 out_format = f;
941         else
942                 bad_args("At most two formats may be given.");
943 }
944
945 static struct format *current_format(void)
946 {
947         if (out_format)
948                 return out_format;
949         if (in_format)
950                 return in_format;
951         set_format(FORM_TSV);
952         return in_format;
953 }
954
955 int main(int argc, char **argv)
956 {
957         int opt;
958         const char *err;
959
960         while ((opt = getopt_long(argc, argv, short_options, long_options, NULL)) >= 0)
961                 switch (opt) {
962                         case 'c':
963                                 set_format(FORM_CSV);
964                                 break;
965                         case 'd':
966                                 if (optarg[0])
967                                         current_format()->fs = optarg[0];
968                                 else
969                                         bad_args("No field delimiter given.");
970                                 break;
971                         case 'f':
972                                 current_format()->set_field_names = optarg;
973                                 break;
974                         case 'h':
975                                 current_format()->has_header = 1;
976                                 break;
977                         case 'q':
978                                 current_format()->quiet = 1;
979                                 break;
980                         case 'r':
981                                 set_format(FORM_REGEX);
982                                 err = regex_set(current_format(), optarg);
983                                 if (err)
984                                         bad_args("Error compiling regex: %s", err);
985                                 break;
986                         case 's':
987                                 if (current_format()->id != FORM_WS && current_format()->id != FORM_REGEX)
988                                         bad_args("--sloppy makes sense only for --ws or --regex.");
989                                 current_format()->sloppy = 1;
990                                 break;
991                         case 't':
992                                 set_format(FORM_TSV);
993                                 break;
994                         case 'w':
995                                 set_format(FORM_WS);
996                                 break;
997                         case OPT_ALWAYS_QUOTE:
998                                 if (current_format()->id != FORM_CSV)
999                                         bad_args("--always-quote makes sense only for --csv.");
1000                                 current_format()->always_quote = 1;
1001                                 break;
1002                         case OPT_HELP:
1003                                 usage();
1004                         case OPT_VERSION:
1005                                 puts("This is xsv version " VERSION ".");
1006                                 exit(0);
1007                         case OPT_TRIM:
1008                                 want_trim = 1;
1009                                 break;
1010                         case OPT_TABLE:
1011                                 set_format(FORM_TABLE);
1012                                 break;
1013                         case OPT_TABLE_SEP:
1014                                 current_format()->table_sep = atoi(optarg);
1015                                 break;
1016                         case OPT_GRID:
1017                                 current_format()->table_grid = 1;
1018                                 break;
1019                         case OPT_EQUALIZE:
1020                                 want_equalize = 1;
1021                                 break;
1022                         default:
1023                                 bad_args(NULL);
1024                 }
1025
1026         current_format();
1027         if (!out_format)
1028                 out_format = in_format;
1029         if (!in_format->read_line)
1030                 bad_args("Write-only format selected for input.");
1031         if (!out_format->write_line)
1032                 bad_args("Read-only format selected for output.");
1033         read_header();
1034
1035         for (int i = optind; i < argc; i++) {
1036                 err = parse_selector(argv[i]);
1037                 if (err)
1038                         bad_args(err);
1039         }
1040         finish_parse_selectors();
1041
1042         want_stats = out_format->needs_stats | want_equalize;
1043         if (want_stats)
1044                 two_pass();
1045         else
1046                 one_pass(3);
1047         return 0;
1048 }