]> mj.ucw.cz Git - xsv.git/blob - xsv.c
Added --regex format and --always-quote for --csv
[xsv.git] / xsv.c
1 /*
2  *      A Swiss-Army Knife for CSV-like Files
3  *
4  *      (c) 2012 Martin Mares <mj@ucw.cz>
5  */
6
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <string.h>
10 #include <stdarg.h>
11 #include <getopt.h>
12
13 #include <pcre.h>
14
15 /*** Memory allocation ***/
16
/*
 * Allocate `bytes` bytes with malloc(). If malloc() returns NULL, print an
 * out-of-memory message to stderr and terminate the program with status 1,
 * so the returned pointer is never NULL.
 */
static void *xmalloc(size_t bytes)
{
        void *mem = malloc(bytes);
        if (mem)
                return mem;

        fprintf(stderr, "xsv: Out of memory (cannot allocate %zu bytes)\n", bytes);
        exit(1);
}
26
/*
 * Resize the block `old` to `bytes` bytes with realloc(). If realloc()
 * returns NULL, print an out-of-memory message to stderr and terminate the
 * program with status 1, so the returned pointer is never NULL.
 */
static void *xrealloc(void *old, size_t bytes)
{
        void *mem = realloc(old, bytes);
        if (mem)
                return mem;

        fprintf(stderr, "xsv: Out of memory (cannot allocate %zu bytes)\n", bytes);
        exit(1);
}
36
/*
 * DECLARE_BUF(name, type) generates a growable array of `type`:
 *
 *   name_t            the buffer itself (storage pointer, used count, capacity)
 *   name_init(b)      make the buffer empty with no storage
 *   name_reset(b)     forget all elements, keep the storage
 *   name_count(b)     number of elements currently stored
 *   name_extend(b)    grow the capacity (16 at first, then doubling)
 *   name_push(b)      append one element and return a pointer to it
 *   name_first(b)     pointer to the first element (NULL if never grown)
 *   name_nth(b, n)    pointer to the n-th element (0-based, not range-checked)
 *
 * Storage is resized with xrealloc(), so pointers returned earlier may
 * become invalid after a later push.
 */
#define DECLARE_BUF(name, type) \
        typedef struct { type *start; int count; int max; } name##_t; \
        static inline void name##_init(name##_t *buf) \
        { \
                buf->start = NULL; \
                buf->count = 0; \
                buf->max = 0; \
        } \
        static inline void name##_reset(name##_t *buf) \
        { \
                buf->count = 0; \
        } \
        static inline int name##_count(name##_t *buf) \
        { \
                return buf->count; \
        } \
        static void name##_extend(name##_t *buf) \
        { \
                if (buf->max) \
                        buf->max = 2 * buf->max; \
                else \
                        buf->max = 16; \
                buf->start = xrealloc(buf->start, buf->max * sizeof(type)); \
        } \
        static inline type *name##_push(name##_t *buf) \
        { \
                if (buf->count >= buf->max) \
                        name##_extend(buf); \
                type *slot = buf->start + buf->count; \
                buf->count++; \
                return slot; \
        } \
        static inline type *name##_first(name##_t *buf) \
        { \
                return buf->start; \
        } \
        static inline type *name##_nth(name##_t *buf, int n) \
        { \
                return buf->start + n; \
        }
53
54 /*** Formats and their parameters ***/
55
/* Identifiers of the supported formats, as handled by set_format(). */
enum format_id {
        FORM_UNSPEC = 0,        // no format chosen
        FORM_TSV = 1,           // TAB-separated values
        FORM_CSV = 2,           // comma-separated values with quoting
        FORM_WS = 3,            // values separated by whitespace
        FORM_REGEX = 4,         // separator given by a regular expression
};
63
64 struct format {
65         enum format_id id;
66         int fs;
67         int quote;
68         int quiet;
69         int (*read_line)(void);
70         void (*write_line)(void);
71         // CSV backend:
72         int always_quote;
73         // regex backend:
74         pcre *pcre;
75         pcre_extra *pcre_extra;
76 };
77
78 static struct format *in_format, *out_format;
79
80 struct field {
81         int start_pos;
82         int len;
83 };
84
85 DECLARE_BUF(fields, struct field);
86 DECLARE_BUF(line, unsigned char);
87
88 static fields_t in_fields, out_fields;
89 static struct field *in_field;
90 static line_t in_line;
91 static int line_number;
92
93 static void new_field(void)
94 {
95         in_field = fields_push(&in_fields);
96         in_field->start_pos = line_count(&in_line);
97         in_field->len = 0;
98 }
99
100 static void ensure_field(void)
101 {
102         if (!in_field)
103                 new_field();
104 }
105
106 static void warn(struct format *fmt, char *msg, ...)
107 {
108         if (!fmt->quiet) {
109                 fprintf(stderr, "Warning at line %d: ", line_number);
110                 va_list args;
111                 va_start(args, msg);
112                 vfprintf(stderr, args, msg);
113                 va_end(args);
114                 fputc('\n', stderr);
115         }
116 }
117
118 static int next_line(void)
119 {
120         for (;;) {
121                 int c = getchar();
122                 if (c == '\r')
123                         continue;
124                 if (c < 0)
125                         return !!line_count(&in_line);
126                 if (c == '\n')
127                         return 1;
128                 *line_push(&in_line) = c;
129         }
130 }
131
132 static int csv_read(void)
133 {
134         int quoted = 0;
135         for (;;) {
136                 int c = getchar();
137 restart:
138                 if (c == '\r')
139                         continue;
140                 if (c < 0 || c == '\n') {
141                         if (quoted)
142                                 warn(in_format, "Missing closing quote.");
143                         if (c < 0)
144                                 return !!fields_count(&in_fields);
145                         else
146                                 return 1;
147                 }
148                 if (quoted) {
149                         if (c == in_format->quote) {
150                                 c = getchar();
151                                 if (c != in_format->quote) {
152                                         quoted = 0;
153                                         goto restart;
154                                 }
155                                 // Two quotes assimilate to one
156                         }
157                         // Fall through to pushing the character
158                 } else if (c == in_format->quote) {
159                         quoted = 1;
160                         continue;
161                 } else if (c == in_format->fs && !quoted) {
162                         ensure_field();
163                         new_field();
164                         continue;
165                 }
166                 ensure_field();
167                 *line_push(&in_line) = c;
168                 in_field->len++;
169         }
170 }
171
/* Return 1 if `c` is a space, a TAB or a form feed, and 0 otherwise. */
static int is_ws(int c)
{
        switch (c) {
                case ' ':
                case '\t':
                case '\f':
                        return 1;
                default:
                        return 0;
        }
}
176
177 static void csv_write(void)
178 {
179         unsigned char *line = line_first(&in_line);
180         int n = fields_count(&out_fields);
181         for (int i=0; i<n; i++) {
182                 struct field *f = fields_nth(&out_fields, i);
183                 int need_quotes = 0;
184                 if (out_format->quote >= 0) {
185                         need_quotes = out_format->always_quote;
186                         for (int j=0; !need_quotes && j < f->len; j++) {
187                                 int c = line[f->start_pos + j];
188                                 if (c == out_format->fs || c == out_format->quote)
189                                         need_quotes = 1;
190                         }
191                 }
192                 if (i)
193                         putchar(out_format->fs);
194                 if (need_quotes)
195                         putchar(out_format->quote);
196                 for (int j=0; j < f->len; j++) {
197                         int c = line[f->start_pos + j];
198                         if (c == out_format->fs && !need_quotes)
199                                 warn(out_format, "Field separator found inside field and quoting is turned off.");
200                         if (c == out_format->quote)
201                                 putchar(c);
202                         putchar(c);
203                 }
204                 if (need_quotes)
205                         putchar(out_format->quote);
206         }
207         putchar('\n');
208 }
209
210 static int ws_read(void)
211 {
212         int ws = 0;
213         for (;;) {
214                 int c = getchar();
215                 if (c < 0)
216                         return !!fields_count(&in_fields);
217                 if (c == '\r')
218                         continue;
219                 if (c == '\n')
220                         return 1;
221                 if (is_ws(c)) {
222                         ensure_field();
223                         if (!ws)
224                                 new_field();
225                         ws++;
226                 } else {
227                         ensure_field();
228                         *line_push(&in_line) = c;
229                         in_field->len++;
230                         ws = 0;
231                 }
232         }
233 }
234
235 static const char *regex_set(struct format *f, char *rx)
236 {
237         const char *err;
238         int errpos;
239         f->pcre = pcre_compile(rx, PCRE_DOLLAR_ENDONLY, &err, &errpos, NULL);
240         if (!f->pcre)
241                 return err;
242
243         f->pcre_extra = pcre_study(f->pcre, 0, &err);
244         if (!f->pcre_extra)
245                 return err;
246
247         return NULL;
248 }
249
250 static int regex_read(void)
251 {
252         if (!next_line())
253                 return 0;
254
255         unsigned char *c = line_first(&in_line);
256         int n = line_count(&in_line);
257         if (!n)
258                 return 1;
259
260         int i = 0;
261         for (;;) {
262                 int ovec[3];
263                 int sep = pcre_exec(in_format->pcre, in_format->pcre_extra, (char *) c, n, i, 0, ovec, 3);
264                 if (sep < 0) {
265                         if (sep != PCRE_ERROR_NOMATCH)
266                                 warn(in_format, "PCRE matching error %d", sep);
267                         // No further occurrence of the separator: the rest is a single field
268                         new_field();
269                         in_field->start_pos = i;
270                         in_field->len = n - i;
271                         return 1;
272                 }
273                 new_field();
274                 in_field->start_pos = i;
275                 in_field->len = ovec[0] - i;
276                 i = ovec[1];
277         }
278 }
279
280 /*** Transforms ***/
281
282 static void trim_fields(void)
283 {
284         unsigned char *line = line_first(&in_line);
285         for (int i = 0; i < fields_count(&in_fields); i++) {
286                 struct field *f = fields_nth(&in_fields, i);
287                 while (f->len && is_ws(line[f->start_pos]))
288                         f->start_pos++, f->len--;
289                 while (f->len && is_ws(line[f->start_pos + f->len - 1]))
290                         f->len--;
291         }
292 }
293
294 /*** Field selection ***/
295
296 struct selector {
297         int first_field, last_field;
298 };
299
300 DECLARE_BUF(selectors, struct selector);
301 static selectors_t selectors;
302
303 static char *parse_selector(char *str)
304 {
305         char buf[strlen(str) + 1];
306         strcpy(buf, str);
307
308         struct selector *s = selectors_push(&selectors);
309         char *sep = strchr(buf, '-');
310         if (sep) {
311                 *sep++ = 0;
312                 s->first_field = atoi(buf);
313                 s->last_field = atoi(sep);
314         } else
315                 s->first_field = s->last_field = atoi(buf);
316
317         return NULL;
318 }
319
320 static void finish_parse_selectors(void)
321 {
322         if (!selectors_count(&selectors))
323                 parse_selector("-");
324 }
325
326 static void select_fields(void)
327 {
328         for (int i = 0; i < selectors_count(&selectors); i++) {
329                 struct selector *s = selectors_nth(&selectors, i);
330                 int first = s->first_field;
331                 if (first <= 0)
332                         first = 1;
333                 int last = s->last_field;
334                 if (last <= 0)
335                         last = fields_count(&in_fields);
336                 for (int j = first; j <= last; j++) {
337                         struct field *f = fields_push(&out_fields);
338                         if (j >= 1 && j <= fields_count(&in_fields))
339                                 *f = *fields_nth(&in_fields, j-1);
340                         else
341                                 f->start_pos = f->len = 0;
342                 }
343         }
344 }
345
346 /*** Parsing of arguments ***/
347
/* Print the help text to stdout and exit with status 0. */
static void usage(void)
{
        static const char help_text[] =
                "Usage: xsv <in-format> [<out-format>] <options> [<fields>]\n"
                "\n"
                "Formats:\n"
                "-t, --tsv               TAB-separated values (default)\n"
                "-c, --csv               Comma-separated values\n"
                "-w, --ws                Values separated by arbitrary whitespace\n"
                "-r, --regex=<rx>        Separator given by Perl regular expression (input only)\n"
                "\n"
                "Format parameters:\n"
                "-d, --fs=<char>         Delimiter of fields\n"
                "-q, --quiet             Do not show warnings\n"
                "    --always-quote      Put quotes around all fields (CSV output only)\n"
                "\n"
                "Other options:\n"
                "    --trim              Trim leading and trailing whitespaces in fields\n";

        fputs(help_text, stdout);
        exit(0);
}
369
/*
 * Report a command-line error and exit with status 1.
 *
 * When `msg` is not NULL, it is a printf-style format that is printed to
 * stderr with an "xsv: " prefix and a trailing newline. A hint pointing to
 * `xsv --help' is printed in either case.
 */
static void bad_args(const char *msg, ...)
{
        if (msg != NULL) {
                va_list ap;

                fputs("xsv: ", stderr);
                va_start(ap, msg);
                vfprintf(stderr, msg, ap);
                va_end(ap);
                putc('\n', stderr);
        }

        fputs("Try `xsv --help' for more information.\n", stderr);
        exit(1);
}
383
/* Short options accepted by getopt_long(); a colon marks a required argument. */
static const char short_options[] = "cd:qr:tw";

/* Codes of options that exist in long form only (above the range of characters). */
enum long_options {
        OPT_HELP = 256,
        OPT_TRIM = 257,
        OPT_ALWAYS_QUOTE = 258,
};

/* Long options; those with a short equivalent return its character. */
static const struct option long_options[] = {
        { "always-quote",       no_argument,            NULL,   OPT_ALWAYS_QUOTE },
        { "csv",                no_argument,            NULL,   'c' },
        { "fs",                 required_argument,      NULL,   'd' },
        { "quiet",              no_argument,            NULL,   'q' },
        { "regex",              required_argument,      NULL,   'r' },
        { "trim",               no_argument,            NULL,   OPT_TRIM },
        { "tsv",                no_argument,            NULL,   't' },
        { "ws",                 no_argument,            NULL,   'w' },
        { "help",               no_argument,            NULL,   OPT_HELP },
        { NULL,                 0,                      NULL,   0 },
};
404
405 static void set_format(int format_id)
406 {
407         struct format *f = xmalloc(sizeof(*f));
408         memset(f, 0, sizeof(*f));
409         f->id = format_id;
410
411         switch (format_id) {
412                 case FORM_TSV:
413                         f->fs = '\t';
414                         f->quote = -1;
415                         f->read_line = csv_read;
416                         f->write_line = csv_write;
417                         break;
418                 case FORM_CSV:
419                         f->fs = ',';
420                         f->quote = '"';
421                         f->read_line = csv_read;
422                         f->write_line = csv_write;
423                         break;
424                 case FORM_WS:
425                         f->fs = ' ';
426                         f->quote = -1;
427                         f->read_line = ws_read;
428                         f->write_line = csv_write;
429                         break;
430                 case FORM_REGEX:
431                         f->read_line = regex_read;
432                         break;
433         }
434
435         if (!in_format)
436                 in_format = f;
437         else if (!out_format)
438                 out_format = f;
439         else
440                 bad_args("At most two format may be given.");
441 }
442
443 static struct format *current_format(void)
444 {
445         if (out_format)
446                 return out_format;
447         if (in_format)
448                 return in_format;
449         set_format(FORM_TSV);
450         return in_format;
451 }
452
453 int main(int argc, char **argv)
454 {
455         int opt;
456         int want_trim = 0;
457         const char *err;
458
459         while ((opt = getopt_long(argc, argv, short_options, long_options, NULL)) >= 0)
460                 switch (opt) {
461                         case 'c':
462                                 set_format(FORM_CSV);
463                                 break;
464                         case 'd':
465                                 if (optarg[0])
466                                         current_format()->fs = optarg[0];
467                                 else
468                                         bad_args("No field delimiter given.");
469                                 break;
470                         case 'q':
471                                 current_format()->quiet = 1;
472                                 break;
473                         case 'r':
474                                 set_format(FORM_REGEX);
475                                 err = regex_set(current_format(), optarg);
476                                 if (err)
477                                         bad_args("Error compiling regex: %s", err);
478                                 break;
479                         case 't':
480                                 set_format(FORM_TSV);
481                                 break;
482                         case 'w':
483                                 set_format(FORM_WS);
484                                 break;
485                         case OPT_ALWAYS_QUOTE:
486                                 if (current_format()->id != FORM_CSV)
487                                         bad_args("--always-quote makes sense only for CSV.");
488                                 current_format()->always_quote = 1;
489                                 break;
490                         case OPT_HELP:
491                                 usage();
492                         case OPT_TRIM:
493                                 want_trim = 1;
494                                 break;
495                         default:
496                                 bad_args(NULL);
497                 }
498
499         current_format();
500         if (!out_format)
501                 out_format = in_format;
502         if (!in_format->read_line)
503                 bad_args("Write-only format selected for input.");
504         if (!out_format->write_line)
505                 bad_args("Read-only format selected for output.");
506
507         for (int i = optind; i < argc; i++) {
508                 err = parse_selector(argv[i]);
509                 if (err)
510                         bad_args(err);
511         }
512         finish_parse_selectors();
513
514         fields_init(&in_fields);
515         fields_init(&out_fields);
516         line_init(&in_line);
517
518         for (;;) {
519                 line_number++;
520                 fields_reset(&in_fields);
521                 line_reset(&in_line);
522                 in_field = NULL;
523                 if (!in_format->read_line())
524                         break;
525
526                 if (want_trim)
527                         trim_fields();
528
529                 fields_reset(&out_fields);
530                 select_fields();
531
532                 out_format->write_line();
533         }
534
535         return 0;
536 }