]> mj.ucw.cz Git - xsv.git/blob - xsv.c
4ad9e477714db146b918eb600e8f5ba4d9f10301
[xsv.git] / xsv.c
1 /*
2  *      A Swiss-Army Knife for CSV-like Files
3  *
4  *      (c) 2012 Martin Mares <mj@ucw.cz>
5  */
6
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#include <limits.h>
#include <getopt.h>
12
13 #include <pcre.h>
14
15 /*** Memory allocation ***/
16
/*
 * malloc() wrapper that never returns NULL: when the allocation fails,
 * a message is written to stderr and the process exits with status 1.
 */
static void *xmalloc(size_t bytes)
{
	void *mem = malloc(bytes);

	if (mem)
		return mem;

	fprintf(stderr, "xsv: Out of memory (cannot allocate %zu bytes)\n", bytes);
	exit(1);
}
26
/*
 * realloc() wrapper that never returns NULL: when the reallocation fails,
 * a message is written to stderr and the process exits with status 1.
 * As with realloc(), `old` may be NULL to request a fresh block.
 */
static void *xrealloc(void *old, size_t bytes)
{
	void *mem = realloc(old, bytes);

	if (mem)
		return mem;

	fprintf(stderr, "xsv: Out of memory (cannot allocate %zu bytes)\n", bytes);
	exit(1);
}
36
/*
 * DECLARE_BUF(name, type) generates a growable array of `type`:
 *
 *	name_t			the buffer itself (storage, used count, capacity)
 *	name_init(b)		make the buffer empty with no storage attached
 *	name_reset(b)		forget all elements, keep the storage for reuse
 *	name_count(b)		number of elements currently stored
 *	name_extend(b)		grow the capacity: 16 elements at first, doubled afterwards
 *	name_push(b)		append one uninitialized element and return a pointer to it
 *	name_first(b)		pointer to the first element (NULL before any storage exists)
 *	name_nth(b, n)		pointer to the n-th element (0-based, not range-checked)
 *
 * Growing goes through xrealloc(), so pointers returned earlier may become
 * invalid after a push.
 */
#define DECLARE_BUF(name, type) \
	typedef struct { type *start; int count; int max; } name##_t;			\
	static inline void name##_init(name##_t *b)					\
	{										\
		b->start = NULL;							\
		b->max = 0;								\
		b->count = 0;								\
	}										\
	static inline void name##_reset(name##_t *b)					\
	{										\
		b->count = 0;								\
	}										\
	static inline int name##_count(name##_t *b)					\
	{										\
		return b->count;							\
	}										\
	static void name##_extend(name##_t *b)						\
	{										\
		if (b->max)								\
			b->max = 2*b->max;						\
		else									\
			b->max = 16;							\
		b->start = xrealloc(b->start, b->max * sizeof(type));			\
	}										\
	static inline type *name##_push(name##_t *b)					\
	{										\
		if (b->count >= b->max)							\
			name##_extend(b);						\
		type *slot = &b->start[b->count];					\
		b->count++;								\
		return slot;								\
	}										\
	static inline type *name##_first(name##_t *b)					\
	{										\
		return b->start;							\
	}										\
	static inline type *name##_nth(name##_t *b, int n)				\
	{										\
		return &b->start[n];							\
	}										\
	// end
53
54 /*** Formats and their parameters ***/
55
/* Identifiers of the supported file formats. */
enum format_id {
	FORM_UNSPEC = 0,	// no format chosen
	FORM_TSV,		// TAB-separated values
	FORM_CSV,		// comma-separated values
	FORM_WS,		// values separated by arbitrary whitespace
	FORM_REGEX,		// separator given by a regular expression
};
63
64 struct format {
65         enum format_id id;
66         int fs;
67         int quote;
68         int quiet;
69         int (*read_line)(void);
70         void (*write_line)(void);
71         // CSV backend:
72         int always_quote;
73         // regex backend:
74         pcre *pcre;
75         pcre_extra *pcre_extra;
76 };
77
/* Formats in use; both stay NULL until the corresponding format is chosen. */
static struct format *in_format;
static struct format *out_format;
79
/* One field of a line, described as a slice of the line buffer. */
struct field {
	int start_pos;		// index of the first byte of the field in the line buffer
	int len;		// number of bytes in the field
};
84
85 DECLARE_BUF(fields, struct field);
86 DECLARE_BUF(line, unsigned char);
87
88 static fields_t in_fields, out_fields;
89 static struct field *in_field;
90 static line_t in_line;
91 static int line_number;
92
93 static void new_field(int pos)
94 {
95         in_field = fields_push(&in_fields);
96         in_field->start_pos = pos;
97         in_field->len = 0;
98 }
99
100 static void ensure_field(int pos)
101 {
102         if (!in_field)
103                 new_field(pos);
104 }
105
106 static void warn(struct format *fmt, char *msg, ...)
107 {
108         if (!fmt->quiet) {
109                 fprintf(stderr, "Warning at line %d: ", line_number);
110                 va_list args;
111                 va_start(args, msg);
112                 vfprintf(stderr, args, msg);
113                 va_end(args);
114                 fputc('\n', stderr);
115         }
116 }
117
118 static int next_line(void)
119 {
120         for (;;) {
121                 int c = getchar();
122                 if (c == '\r')
123                         continue;
124                 if (c < 0)
125                         return !!line_count(&in_line);
126                 if (c == '\n')
127                         return 1;
128                 *line_push(&in_line) = c;
129         }
130 }
131
132 static int csv_read(void)
133 {
134         int quoted = 0;
135         for (;;) {
136                 int c = getchar();
137                 int i = line_count(&in_line);
138 restart:
139                 if (c == '\r')
140                         continue;
141                 if (c < 0 || c == '\n') {
142                         if (quoted)
143                                 warn(in_format, "Missing closing quote.");
144                         if (c < 0)
145                                 return !!fields_count(&in_fields);
146                         else
147                                 return 1;
148                 }
149                 if (quoted) {
150                         if (c == in_format->quote) {
151                                 c = getchar();
152                                 if (c != in_format->quote) {
153                                         quoted = 0;
154                                         goto restart;
155                                 }
156                                 // Two quotes assimilate to one
157                         }
158                         // Fall through to pushing the character
159                 } else if (c == in_format->quote) {
160                         quoted = 1;
161                         continue;
162                 } else if (c == in_format->fs && !quoted) {
163                         ensure_field(i);
164                         new_field(i);
165                         continue;
166                 }
167                 ensure_field(i);
168                 *line_push(&in_line) = c;
169                 in_field->len++;
170         }
171 }
172
/* Return 1 if `c` is a space, a TAB or a form feed, 0 otherwise. */
static int is_ws(int c)
{
	switch (c) {
		case ' ':
		case '\t':
		case '\f':
			return 1;
		default:
			return 0;
	}
}
177
178 static void csv_write(void)
179 {
180         unsigned char *line = line_first(&in_line);
181         int n = fields_count(&out_fields);
182         for (int i=0; i<n; i++) {
183                 struct field *f = fields_nth(&out_fields, i);
184                 int need_quotes = 0;
185                 if (out_format->quote >= 0) {
186                         need_quotes = out_format->always_quote;
187                         for (int j=0; !need_quotes && j < f->len; j++) {
188                                 int c = line[f->start_pos + j];
189                                 if (c == out_format->fs || c == out_format->quote)
190                                         need_quotes = 1;
191                         }
192                 }
193                 if (i)
194                         putchar(out_format->fs);
195                 if (need_quotes)
196                         putchar(out_format->quote);
197                 for (int j=0; j < f->len; j++) {
198                         int c = line[f->start_pos + j];
199                         if (c == out_format->fs && !need_quotes)
200                                 warn(out_format, "Field separator found inside field and quoting is turned off.");
201                         if (c == out_format->quote)
202                                 putchar(c);
203                         putchar(c);
204                 }
205                 if (need_quotes)
206                         putchar(out_format->quote);
207         }
208         putchar('\n');
209 }
210
211 static int ws_read(void)
212 {
213         if (!next_line())
214                 return 0;
215
216         unsigned char *line = line_first(&in_line);
217         int n = line_count(&in_line);
218         if (!n)
219                 return 1;
220
221         int ws = 0;
222         new_field(0);
223         for (int i=0; i<n; i++) {
224                 int c = line[i];
225                 if (is_ws(c)) {
226                         ws++;
227                 } else {
228                         if (ws) {
229                                 new_field(i);
230                                 ws = 0;
231                         }
232                         in_field->len++;
233                 }
234         }
235
236         if (ws)
237                 new_field(n);
238         return 1;
239 }
240
241 static const char *regex_set(struct format *f, char *rx)
242 {
243         const char *err;
244         int errpos;
245         f->pcre = pcre_compile(rx, PCRE_DOLLAR_ENDONLY, &err, &errpos, NULL);
246         if (!f->pcre)
247                 return err;
248
249         f->pcre_extra = pcre_study(f->pcre, 0, &err);
250         if (!f->pcre_extra)
251                 return err;
252
253         return NULL;
254 }
255
256 static int regex_read(void)
257 {
258         if (!next_line())
259                 return 0;
260
261         unsigned char *c = line_first(&in_line);
262         int n = line_count(&in_line);
263         if (!n)
264                 return 1;
265
266         int i = 0;
267         for (;;) {
268                 int ovec[3];
269                 int sep = pcre_exec(in_format->pcre, in_format->pcre_extra, (char *) c, n, i, 0, ovec, 3);
270                 if (sep < 0) {
271                         if (sep != PCRE_ERROR_NOMATCH)
272                                 warn(in_format, "PCRE matching error %d", sep);
273                         // No further occurrence of the separator: the rest is a single field
274                         new_field(i);
275                         in_field->len = n - i;
276                         return 1;
277                 }
278                 new_field(i);
279                 in_field->len = ovec[0] - i;
280                 i = ovec[1];
281         }
282 }
283
284 /*** Transforms ***/
285
286 static void trim_fields(void)
287 {
288         unsigned char *line = line_first(&in_line);
289         for (int i = 0; i < fields_count(&in_fields); i++) {
290                 struct field *f = fields_nth(&in_fields, i);
291                 while (f->len && is_ws(line[f->start_pos]))
292                         f->start_pos++, f->len--;
293                 while (f->len && is_ws(line[f->start_pos + f->len - 1]))
294                         f->len--;
295         }
296 }
297
298 /*** Field selection ***/
299
300 struct selector {
301         int first_field, last_field;
302 };
303
304 DECLARE_BUF(selectors, struct selector);
305 static selectors_t selectors;
306
307 static char *parse_selector(char *str)
308 {
309         char buf[strlen(str) + 1];
310         strcpy(buf, str);
311
312         struct selector *s = selectors_push(&selectors);
313         char *sep = strchr(buf, '-');
314         if (sep) {
315                 *sep++ = 0;
316                 s->first_field = atoi(buf);
317                 s->last_field = atoi(sep);
318         } else
319                 s->first_field = s->last_field = atoi(buf);
320
321         return NULL;
322 }
323
324 static void finish_parse_selectors(void)
325 {
326         if (!selectors_count(&selectors))
327                 parse_selector("-");
328 }
329
330 static void select_fields(void)
331 {
332         for (int i = 0; i < selectors_count(&selectors); i++) {
333                 struct selector *s = selectors_nth(&selectors, i);
334                 int first = s->first_field;
335                 if (first <= 0)
336                         first = 1;
337                 int last = s->last_field;
338                 if (last <= 0)
339                         last = fields_count(&in_fields);
340                 for (int j = first; j <= last; j++) {
341                         struct field *f = fields_push(&out_fields);
342                         if (j >= 1 && j <= fields_count(&in_fields))
343                                 *f = *fields_nth(&in_fields, j-1);
344                         else
345                                 f->start_pos = f->len = 0;
346                 }
347         }
348 }
349
350 /*** Parsing of arguments ***/
351
/* Print the usage summary to stdout and exit with status 0. */
static void usage(void)
{
	fputs(
		"Usage: xsv <in-format> [<out-format>] <options> [<fields>]\n"
		"\n"
		"Formats:\n"
		"-t, --tsv               TAB-separated values (default)\n"
		"-c, --csv               Comma-separated values\n"
		"-w, --ws                Values separated by arbitrary whitespace\n"
		"-r, --regex=<rx>        Separator given by Perl regular expression (input only)\n"
		"\n"
		"Format parameters:\n"
		"-d, --fs=<char>         Delimiter of fields\n"
		"-q, --quiet             Do not show warnings\n"
		"    --always-quote      Put quotes around all fields (CSV output only)\n"
		"\n"
		"Other options:\n"
		"    --trim              Trim leading and trailing whitespaces in fields\n",
		stdout);
	exit(0);
}
373
/*
 * Report a command-line error and exit with status 1.
 *
 * `msg` is a printf-style format describing the problem; it is printed to
 * stderr with an "xsv: " prefix. If `msg` is NULL, only the hint pointing to
 * --help is printed.
 */
static void bad_args(const char *msg, ...)
{
	if (msg != NULL) {
		va_list ap;

		fputs("xsv: ", stderr);
		va_start(ap, msg);
		vfprintf(stderr, msg, ap);
		va_end(ap);
		putc('\n', stderr);
	}

	fputs("Try `xsv --help' for more information.\n", stderr);
	exit(1);
}
387
/* Short options accepted by getopt_long(); a colon marks a required argument. */
static const char short_options[] = "cd:qr:tw";

/* Codes of options which have no short form (kept above the range of characters). */
enum long_options {
	OPT_HELP = 256,
	OPT_TRIM,
	OPT_ALWAYS_QUOTE,
};

/* Long options; those with a short form share its option code. */
static const struct option long_options[] = {
	{ .name = "always-quote",	.has_arg = no_argument,		.flag = NULL,	.val = OPT_ALWAYS_QUOTE },
	{ .name = "csv",		.has_arg = no_argument,		.flag = NULL,	.val = 'c' },
	{ .name = "fs",			.has_arg = required_argument,	.flag = NULL,	.val = 'd' },
	{ .name = "quiet",		.has_arg = no_argument,		.flag = NULL,	.val = 'q' },
	{ .name = "regex",		.has_arg = required_argument,	.flag = NULL,	.val = 'r' },
	{ .name = "trim",		.has_arg = no_argument,		.flag = NULL,	.val = OPT_TRIM },
	{ .name = "tsv",		.has_arg = no_argument,		.flag = NULL,	.val = 't' },
	{ .name = "ws",			.has_arg = no_argument,		.flag = NULL,	.val = 'w' },
	{ .name = "help",		.has_arg = no_argument,		.flag = NULL,	.val = OPT_HELP },
	{ .name = NULL,			.has_arg = 0,			.flag = NULL,	.val = 0 },	// terminator
};
408
409 static void set_format(int format_id)
410 {
411         struct format *f = xmalloc(sizeof(*f));
412         memset(f, 0, sizeof(*f));
413         f->id = format_id;
414
415         switch (format_id) {
416                 case FORM_TSV:
417                         f->fs = '\t';
418                         f->quote = -1;
419                         f->read_line = csv_read;
420                         f->write_line = csv_write;
421                         break;
422                 case FORM_CSV:
423                         f->fs = ',';
424                         f->quote = '"';
425                         f->read_line = csv_read;
426                         f->write_line = csv_write;
427                         break;
428                 case FORM_WS:
429                         f->fs = ' ';
430                         f->quote = -1;
431                         f->read_line = ws_read;
432                         f->write_line = csv_write;
433                         break;
434                 case FORM_REGEX:
435                         f->read_line = regex_read;
436                         break;
437         }
438
439         if (!in_format)
440                 in_format = f;
441         else if (!out_format)
442                 out_format = f;
443         else
444                 bad_args("At most two format may be given.");
445 }
446
447 static struct format *current_format(void)
448 {
449         if (out_format)
450                 return out_format;
451         if (in_format)
452                 return in_format;
453         set_format(FORM_TSV);
454         return in_format;
455 }
456
457 int main(int argc, char **argv)
458 {
459         int opt;
460         int want_trim = 0;
461         const char *err;
462
463         while ((opt = getopt_long(argc, argv, short_options, long_options, NULL)) >= 0)
464                 switch (opt) {
465                         case 'c':
466                                 set_format(FORM_CSV);
467                                 break;
468                         case 'd':
469                                 if (optarg[0])
470                                         current_format()->fs = optarg[0];
471                                 else
472                                         bad_args("No field delimiter given.");
473                                 break;
474                         case 'q':
475                                 current_format()->quiet = 1;
476                                 break;
477                         case 'r':
478                                 set_format(FORM_REGEX);
479                                 err = regex_set(current_format(), optarg);
480                                 if (err)
481                                         bad_args("Error compiling regex: %s", err);
482                                 break;
483                         case 't':
484                                 set_format(FORM_TSV);
485                                 break;
486                         case 'w':
487                                 set_format(FORM_WS);
488                                 break;
489                         case OPT_ALWAYS_QUOTE:
490                                 if (current_format()->id != FORM_CSV)
491                                         bad_args("--always-quote makes sense only for CSV.");
492                                 current_format()->always_quote = 1;
493                                 break;
494                         case OPT_HELP:
495                                 usage();
496                         case OPT_TRIM:
497                                 want_trim = 1;
498                                 break;
499                         default:
500                                 bad_args(NULL);
501                 }
502
503         current_format();
504         if (!out_format)
505                 out_format = in_format;
506         if (!in_format->read_line)
507                 bad_args("Write-only format selected for input.");
508         if (!out_format->write_line)
509                 bad_args("Read-only format selected for output.");
510
511         for (int i = optind; i < argc; i++) {
512                 err = parse_selector(argv[i]);
513                 if (err)
514                         bad_args(err);
515         }
516         finish_parse_selectors();
517
518         fields_init(&in_fields);
519         fields_init(&out_fields);
520         line_init(&in_line);
521
522         for (;;) {
523                 line_number++;
524                 fields_reset(&in_fields);
525                 line_reset(&in_line);
526                 in_field = NULL;
527                 if (!in_format->read_line())
528                         break;
529
530                 if (want_trim)
531                         trim_fields();
532
533                 fields_reset(&out_fields);
534                 select_fields();
535
536                 out_format->write_line();
537         }
538
539         return 0;
540 }