]> mj.ucw.cz Git - xsv.git/blob - xsv.c
6263da080216392d87dd42cf2a407ca2ecc3865c
[xsv.git] / xsv.c
1 /*
2  *      A Swiss-Army Knife for CSV-like Files
3  *
4  *      (c) 2012 Martin Mares <mj@ucw.cz>
5  */
6
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#include <limits.h>
#include <getopt.h>

#include <pcre.h>
14
15 /*** Memory allocation ***/
16
/*
 * Allocate `bytes' bytes with malloc(). Never returns NULL: when malloc()
 * yields NULL, a message is printed to stderr and the program exits with
 * status 1.
 */
static void *xmalloc(size_t bytes)
{
        void *mem = malloc(bytes);
        if (mem)
                return mem;

        fprintf(stderr, "xsv: Out of memory (cannot allocate %zu bytes)\n", bytes);
        exit(1);
}
26
/*
 * Resize the block `old' to `bytes' bytes with realloc(). Never returns
 * NULL: when realloc() yields NULL, a message is printed to stderr and
 * the program exits with status 1.
 */
static void *xrealloc(void *old, size_t bytes)
{
        void *mem = realloc(old, bytes);
        if (mem)
                return mem;

        fprintf(stderr, "xsv: Out of memory (cannot allocate %zu bytes)\n", bytes);
        exit(1);
}
36
/*
 * DECLARE_BUF(name, type) generates a growable array of `type':
 *
 *   name_t           the buffer: element storage, used count, capacity
 *   name_init(b)     make the buffer empty with no storage
 *   name_reset(b)    forget all elements, keep the storage
 *   name_count(b)    number of elements currently stored
 *   name_extend(b)   grow the capacity (16 at first, then doubling)
 *   name_push(b)     append one element, return a pointer to it
 *                    (the new element is not initialized)
 *   name_first(b)    pointer to the first element
 *   name_nth(b, n)   pointer to the n-th element (0-based, unchecked)
 *
 * Growing goes through xrealloc(), so the storage may move: pointers
 * obtained earlier are stale after a push that had to extend the buffer.
 */
#define DECLARE_BUF(name, type) \
        typedef struct { type *start; int count; int max; } name##_t;                           \
        static inline void name##_init(name##_t *buf)                                           \
        {                                                                                       \
                buf->start = NULL;                                                              \
                buf->count = 0;                                                                 \
                buf->max = 0;                                                                   \
        }                                                                                       \
        static inline void name##_reset(name##_t *buf)                                          \
        {                                                                                       \
                buf->count = 0;                                                                 \
        }                                                                                       \
        static inline int name##_count(name##_t *buf)                                           \
        {                                                                                       \
                return buf->count;                                                              \
        }                                                                                       \
        static void name##_extend(name##_t *buf)                                                \
        {                                                                                       \
                int new_max = buf->max ? 2*buf->max : 16;                                       \
                buf->start = xrealloc(buf->start, new_max * sizeof(type));                      \
                buf->max = new_max;                                                             \
        }                                                                                       \
        static inline type *name##_push(name##_t *buf)                                          \
        {                                                                                       \
                if (buf->count >= buf->max)                                                     \
                        name##_extend(buf);                                                     \
                return buf->start + buf->count++;                                               \
        }                                                                                       \
        static inline type *name##_first(name##_t *buf)                                         \
        {                                                                                       \
                return buf->start;                                                              \
        }                                                                                       \
        static inline type *name##_nth(name##_t *buf, int n)                                    \
        {                                                                                       \
                return buf->start + n;                                                          \
        }
53
54 /*** Formats and their parameters ***/
55
/* Identifiers of the supported file formats. */
enum format_id {
        FORM_UNSPEC = 0,        // no format chosen
        FORM_TSV    = 1,        // TAB-separated values
        FORM_CSV    = 2,        // comma-separated values with quoting
        FORM_WS     = 3,        // fields separated by whitespace
        FORM_REGEX  = 4,        // separator given by a regular expression
};
63
64 struct format {
65         enum format_id id;
66         int fs;
67         int quote;
68         int quiet;
69         int (*read_line)(void);
70         void (*write_line)(void);
71         // CSV backend:
72         int always_quote;
73         // WS backend:
74         int strict_ws;
75         // regex backend:
76         pcre *pcre;
77         pcre_extra *pcre_extra;
78 };
79
80 static struct format *in_format, *out_format;
81
82 struct field {
83         int start_pos;
84         int len;
85 };
86
87 DECLARE_BUF(fields, struct field);
88 DECLARE_BUF(line, unsigned char);
89
90 static fields_t in_fields, out_fields;
91 static struct field *in_field;
92 static line_t in_line;
93 static int line_number;
94
95 static void new_field(int pos)
96 {
97         in_field = fields_push(&in_fields);
98         in_field->start_pos = pos;
99         in_field->len = 0;
100 }
101
102 static void ensure_field(int pos)
103 {
104         if (!in_field)
105                 new_field(pos);
106 }
107
108 static void warn(struct format *fmt, char *msg, ...)
109 {
110         if (!fmt->quiet) {
111                 fprintf(stderr, "Warning at line %d: ", line_number);
112                 va_list args;
113                 va_start(args, msg);
114                 vfprintf(stderr, args, msg);
115                 va_end(args);
116                 fputc('\n', stderr);
117         }
118 }
119
120 static int next_line(void)
121 {
122         for (;;) {
123                 int c = getchar();
124                 if (c == '\r')
125                         continue;
126                 if (c < 0)
127                         return !!line_count(&in_line);
128                 if (c == '\n')
129                         return 1;
130                 *line_push(&in_line) = c;
131         }
132 }
133
134 static int csv_read(void)
135 {
136         int quoted = 0;
137         for (;;) {
138                 int c = getchar();
139                 int i = line_count(&in_line);
140 restart:
141                 if (c == '\r')
142                         continue;
143                 if (c < 0 || c == '\n') {
144                         if (quoted)
145                                 warn(in_format, "Missing closing quote.");
146                         if (c < 0)
147                                 return !!fields_count(&in_fields);
148                         else
149                                 return 1;
150                 }
151                 if (quoted) {
152                         if (c == in_format->quote) {
153                                 c = getchar();
154                                 if (c != in_format->quote) {
155                                         quoted = 0;
156                                         goto restart;
157                                 }
158                                 // Two quotes assimilate to one
159                         }
160                         // Fall through to pushing the character
161                 } else if (c == in_format->quote) {
162                         quoted = 1;
163                         continue;
164                 } else if (c == in_format->fs && !quoted) {
165                         ensure_field(i);
166                         new_field(i);
167                         continue;
168                 }
169                 ensure_field(i);
170                 *line_push(&in_line) = c;
171                 in_field->len++;
172         }
173 }
174
/* Return 1 if `c' is a space, a tab or a form feed, 0 otherwise. */
static int is_ws(int c)
{
        switch (c) {
                case ' ':
                case '\t':
                case '\f':
                        return 1;
                default:
                        return 0;
        }
}
179
180 static void csv_write(void)
181 {
182         unsigned char *line = line_first(&in_line);
183         int n = fields_count(&out_fields);
184         for (int i=0; i<n; i++) {
185                 struct field *f = fields_nth(&out_fields, i);
186                 int need_quotes = 0;
187                 if (out_format->quote >= 0) {
188                         need_quotes = out_format->always_quote;
189                         for (int j=0; !need_quotes && j < f->len; j++) {
190                                 int c = line[f->start_pos + j];
191                                 if (c == out_format->fs || c == out_format->quote)
192                                         need_quotes = 1;
193                         }
194                 }
195                 if (i)
196                         putchar(out_format->fs);
197                 if (need_quotes)
198                         putchar(out_format->quote);
199                 for (int j=0; j < f->len; j++) {
200                         int c = line[f->start_pos + j];
201                         if (c == out_format->fs && !need_quotes)
202                                 warn(out_format, "Field separator found inside field and quoting is turned off.");
203                         if (c == out_format->quote)
204                                 putchar(c);
205                         putchar(c);
206                 }
207                 if (need_quotes)
208                         putchar(out_format->quote);
209         }
210         putchar('\n');
211 }
212
213 static int ws_read(void)
214 {
215         if (!next_line())
216                 return 0;
217
218         unsigned char *line = line_first(&in_line);
219         int n = line_count(&in_line);
220         if (!n)
221                 return 1;
222
223         int ws = 0;
224         new_field(0);
225         for (int i=0; i<n; i++) {
226                 int c = line[i];
227                 if (is_ws(c)) {
228                         ws++;
229                 } else {
230                         if (ws) {
231                                 if (!in_field->start_pos &&
232                                     !in_field->len &&
233                                     !in_format->strict_ws)
234                                         in_field->start_pos = i;
235                                 else
236                                         new_field(i);
237                                 ws = 0;
238                         }
239                         in_field->len++;
240                 }
241         }
242
243         if (ws && in_format->strict_ws)
244                 new_field(n);
245         return 1;
246 }
247
248 static const char *regex_set(struct format *f, char *rx)
249 {
250         const char *err;
251         int errpos;
252         f->pcre = pcre_compile(rx, PCRE_DOLLAR_ENDONLY, &err, &errpos, NULL);
253         if (!f->pcre)
254                 return err;
255
256         f->pcre_extra = pcre_study(f->pcre, 0, &err);
257         if (!f->pcre_extra)
258                 return err;
259
260         return NULL;
261 }
262
263 static int regex_read(void)
264 {
265         if (!next_line())
266                 return 0;
267
268         unsigned char *c = line_first(&in_line);
269         int n = line_count(&in_line);
270         if (!n)
271                 return 1;
272
273         int i = 0;
274         for (;;) {
275                 int ovec[3];
276                 int sep = pcre_exec(in_format->pcre, in_format->pcre_extra, (char *) c, n, i, 0, ovec, 3);
277                 if (sep < 0) {
278                         if (sep != PCRE_ERROR_NOMATCH)
279                                 warn(in_format, "PCRE matching error %d", sep);
280                         // No further occurrence of the separator: the rest is a single field
281                         new_field(i);
282                         in_field->len = n - i;
283                         return 1;
284                 }
285                 new_field(i);
286                 in_field->len = ovec[0] - i;
287                 i = ovec[1];
288         }
289 }
290
291 /*** Transforms ***/
292
293 static void trim_fields(void)
294 {
295         unsigned char *line = line_first(&in_line);
296         for (int i = 0; i < fields_count(&in_fields); i++) {
297                 struct field *f = fields_nth(&in_fields, i);
298                 while (f->len && is_ws(line[f->start_pos]))
299                         f->start_pos++, f->len--;
300                 while (f->len && is_ws(line[f->start_pos + f->len - 1]))
301                         f->len--;
302         }
303 }
304
305 /*** Field selection ***/
306
307 struct selector {
308         int first_field, last_field;
309 };
310
311 DECLARE_BUF(selectors, struct selector);
312 static selectors_t selectors;
313
314 static char *parse_selector(char *str)
315 {
316         char buf[strlen(str) + 1];
317         strcpy(buf, str);
318
319         struct selector *s = selectors_push(&selectors);
320         char *sep = strchr(buf, '-');
321         if (sep) {
322                 *sep++ = 0;
323                 s->first_field = atoi(buf);
324                 s->last_field = atoi(sep);
325         } else
326                 s->first_field = s->last_field = atoi(buf);
327
328         return NULL;
329 }
330
331 static void finish_parse_selectors(void)
332 {
333         if (!selectors_count(&selectors))
334                 parse_selector("-");
335 }
336
337 static void select_fields(void)
338 {
339         for (int i = 0; i < selectors_count(&selectors); i++) {
340                 struct selector *s = selectors_nth(&selectors, i);
341                 int first = s->first_field;
342                 if (first <= 0)
343                         first = 1;
344                 int last = s->last_field;
345                 if (last <= 0)
346                         last = fields_count(&in_fields);
347                 for (int j = first; j <= last; j++) {
348                         struct field *f = fields_push(&out_fields);
349                         if (j >= 1 && j <= fields_count(&in_fields))
350                                 *f = *fields_nth(&in_fields, j-1);
351                         else
352                                 f->start_pos = f->len = 0;
353                 }
354         }
355 }
356
357 /*** Parsing of arguments ***/
358
/* Print the help text to stdout and exit with status 0. */
static void usage(void)
{
        fputs(
                "Usage: xsv <in-format> [<out-format>] <options> [<fields>]\n"
                "\n"
                "Formats:\n"
                "-t, --tsv               TAB-separated values (default)\n"
                "-c, --csv               Comma-separated values\n"
                "-w, --ws                Values separated by arbitrary whitespace\n"
                "-W, --strict-ws         Like --ws, but recognize empty columns at start/end\n"
                "-r, --regex=<rx>        Separator given by Perl regular expression (input only)\n"
                "\n"
                "Format parameters:\n"
                "-d, --fs=<char>         Delimiter of fields\n"
                "-q, --quiet             Do not show warnings\n"
                "    --always-quote      Put quotes around all fields (CSV output only)\n"
                "\n"
                "Other options:\n"
                "    --trim              Trim leading and trailing whitespaces in fields\n",
                stdout);
        exit(0);
}
381
/*
 * Report a command-line error and exit with status 1.
 *
 * If `msg' is not NULL, it is a printf-style format string; the
 * formatted message is printed to stderr with an "xsv: " prefix. A hint
 * pointing to --help is printed in either case.
 */
static void bad_args(const char *msg, ...)
{
        if (msg != NULL) {
                va_list ap;

                fputs("xsv: ", stderr);
                va_start(ap, msg);
                vfprintf(stderr, msg, ap);
                va_end(ap);
                putc('\n', stderr);
        }
        fputs("Try `xsv --help' for more information.\n", stderr);
        exit(1);
}
395
/* Short options for getopt_long(); a colon marks an option taking an argument. */
static const char short_options[] = "cd:qr:twW";

/* Codes of options that have no short form; they start above the char range. */
enum long_options {
        OPT_HELP = 256,
        OPT_TRIM,
        OPT_ALWAYS_QUOTE,
};

/* Long options; those with a short form map to its character. */
static const struct option long_options[] = {
        { "always-quote",       no_argument,            NULL,   OPT_ALWAYS_QUOTE },
        { "csv",                no_argument,            NULL,   'c' },
        { "fs",                 required_argument,      NULL,   'd' },
        { "quiet",              no_argument,            NULL,   'q' },
        { "regex",              required_argument,      NULL,   'r' },
        { "strict-ws",          no_argument,            NULL,   'W' },
        { "trim",               no_argument,            NULL,   OPT_TRIM },
        { "tsv",                no_argument,            NULL,   't' },
        { "ws",                 no_argument,            NULL,   'w' },
        { "help",               no_argument,            NULL,   OPT_HELP },
        { NULL,                 0,                      NULL,   0 },
};
417
418 static void set_format(int format_id)
419 {
420         struct format *f = xmalloc(sizeof(*f));
421         memset(f, 0, sizeof(*f));
422         f->id = format_id;
423
424         switch (format_id) {
425                 case FORM_TSV:
426                         f->fs = '\t';
427                         f->quote = -1;
428                         f->read_line = csv_read;
429                         f->write_line = csv_write;
430                         break;
431                 case FORM_CSV:
432                         f->fs = ',';
433                         f->quote = '"';
434                         f->read_line = csv_read;
435                         f->write_line = csv_write;
436                         break;
437                 case FORM_WS:
438                         f->fs = ' ';
439                         f->quote = -1;
440                         f->read_line = ws_read;
441                         f->write_line = csv_write;
442                         break;
443                 case FORM_REGEX:
444                         f->read_line = regex_read;
445                         break;
446         }
447
448         if (!in_format)
449                 in_format = f;
450         else if (!out_format)
451                 out_format = f;
452         else
453                 bad_args("At most two format may be given.");
454 }
455
456 static struct format *current_format(void)
457 {
458         if (out_format)
459                 return out_format;
460         if (in_format)
461                 return in_format;
462         set_format(FORM_TSV);
463         return in_format;
464 }
465
466 int main(int argc, char **argv)
467 {
468         int opt;
469         int want_trim = 0;
470         const char *err;
471
472         while ((opt = getopt_long(argc, argv, short_options, long_options, NULL)) >= 0)
473                 switch (opt) {
474                         case 'c':
475                                 set_format(FORM_CSV);
476                                 break;
477                         case 'd':
478                                 if (optarg[0])
479                                         current_format()->fs = optarg[0];
480                                 else
481                                         bad_args("No field delimiter given.");
482                                 break;
483                         case 'q':
484                                 current_format()->quiet = 1;
485                                 break;
486                         case 'r':
487                                 set_format(FORM_REGEX);
488                                 err = regex_set(current_format(), optarg);
489                                 if (err)
490                                         bad_args("Error compiling regex: %s", err);
491                                 break;
492                         case 't':
493                                 set_format(FORM_TSV);
494                                 break;
495                         case 'w':
496                                 set_format(FORM_WS);
497                                 break;
498                         case 'W':
499                                 set_format(FORM_WS);
500                                 current_format()->strict_ws = 1;
501                                 break;
502                         case OPT_ALWAYS_QUOTE:
503                                 if (current_format()->id != FORM_CSV)
504                                         bad_args("--always-quote makes sense only for CSV.");
505                                 current_format()->always_quote = 1;
506                                 break;
507                         case OPT_HELP:
508                                 usage();
509                         case OPT_TRIM:
510                                 want_trim = 1;
511                                 break;
512                         default:
513                                 bad_args(NULL);
514                 }
515
516         current_format();
517         if (!out_format)
518                 out_format = in_format;
519         if (!in_format->read_line)
520                 bad_args("Write-only format selected for input.");
521         if (!out_format->write_line)
522                 bad_args("Read-only format selected for output.");
523
524         for (int i = optind; i < argc; i++) {
525                 err = parse_selector(argv[i]);
526                 if (err)
527                         bad_args(err);
528         }
529         finish_parse_selectors();
530
531         fields_init(&in_fields);
532         fields_init(&out_fields);
533         line_init(&in_line);
534
535         for (;;) {
536                 line_number++;
537                 fields_reset(&in_fields);
538                 line_reset(&in_line);
539                 in_field = NULL;
540                 if (!in_format->read_line())
541                         break;
542
543                 if (want_trim)
544                         trim_fields();
545
546                 fields_reset(&out_fields);
547                 select_fields();
548
549                 out_format->write_line();
550         }
551
552         return 0;
553 }