]> mj.ucw.cz Git - xsv.git/blob - xsv.c
Added warnings and --quiet
[xsv.git] / xsv.c
1 /*
2  *      A Swiss-Army Knife for CSV-like Files
3  *
4  *      (c) 2012 Martin Mares <mj@ucw.cz>
5  */
6
#include <getopt.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
11
12 /*** Memory allocation ***/
13
/*
 * Allocate `bytes` bytes with malloc().
 *
 * Never returns NULL: if malloc() fails, an error message is printed
 * to stderr and the program exits with status 1.
 */
static void *xmalloc(size_t bytes)
{
        void *mem = malloc(bytes);

        if (mem)
                return mem;

        fprintf(stderr, "xsv: Out of memory (cannot allocate %zu bytes)\n", bytes);
        exit(1);
}
23
/*
 * Resize the allocation `old` to `bytes` bytes with realloc().
 *
 * Never returns NULL: if realloc() returns NULL, an error message is
 * printed to stderr and the program exits with status 1.
 */
static void *xrealloc(void *old, size_t bytes)
{
        void *mem = realloc(old, bytes);

        if (mem)
                return mem;

        fprintf(stderr, "xsv: Out of memory (cannot allocate %zu bytes)\n", bytes);
        exit(1);
}
33
/*
 * DECLARE_BUF(name, type) generates a growable array of `type`:
 *
 *      name_t                  the buffer: storage, number of used items, capacity
 *      name_init(b)            make the buffer empty with no storage
 *      name_reset(b)           forget all items, but keep the storage for reuse
 *      name_count(b)           number of items currently stored
 *      name_push(b)            append one uninitialized item, return a pointer to it
 *      name_nth(b, n)          pointer to the n-th item (0-based, no bounds check)
 *
 * The capacity starts at 16 items and doubles whenever it runs out.
 * Pushing may move the storage, so pointers obtained from name_push()
 * or name_nth() must not be kept across a later push. If the capacity
 * cannot be doubled without overflowing, the program exits with status 1.
 */
#define DECLARE_BUF(name, type) \
        typedef struct { type *start; int count; int max; } name##_t;                           \
        static inline void name##_init(name##_t *b) { b->start = NULL; b->count = b->max = 0; } \
        static inline void name##_reset(name##_t *b) { b->count = 0; }                          \
        static inline int name##_count(name##_t *b) { return b->count; }                        \
        static void name##_extend(name##_t *b) {                                                \
                /* Refuse to double if 2*max overflows int or the byte size overflows size_t */ \
                if (b->max > INT_MAX / 2 ||                                                     \
                    (size_t) b->max > ((size_t) -1 / sizeof(type)) / 2) {                       \
                        fprintf(stderr, "xsv: Buffer too large\n");                             \
                        exit(1);                                                                \
                }                                                                               \
                b->max = b->max ? 2*b->max : 16;                                                \
                b->start = xrealloc(b->start, b->max * sizeof(type));                           \
        }                                                                                       \
        static inline type *name##_push(name##_t *b) {                                          \
                if (b->count >= b->max) name##_extend(b);                                       \
                return &b->start[b->count++];                                                   \
        }                                                                                       \
        static inline type *name##_nth(name##_t *b, int n) { return &b->start[n]; }             \
        /* end of DECLARE_BUF */
49
50 /*** Formats and their parameters ***/
51
/* Identifiers of the file formats which can be selected on the command line. */
enum format_id {
        FORM_UNSPEC,            /* value 0; never passed to set_format() in this file */
        FORM_TSV,               /* TAB-separated values (-t, the default) */
        FORM_CSV,               /* comma-separated values (-c) */
        FORM_WS,                /* values separated by arbitrary whitespace (-w) */
};
58
/* Description of one file format together with its parameters. */
struct format {
        enum format_id id;              /* which format this is */
        int fs;                         /* field separator character */
        int quote;                      /* quote character, or -1 if quoting is not used */
        int quiet;                      /* non-zero: warn() prints nothing for this format */
        int (*read_line)(void);         /* reads one line into in_fields/in_line; returns 0 when there is nothing more to process */
        void (*write_line)(void);       /* writes the fields in out_fields as one line to stdout */
};

/* Formats used for reading and writing; both are set up by set_format() and main(). */
static struct format *in_format, *out_format;
69
/* One field of a line, described as a slice of the bytes stored in in_line. */
struct field {
        int start_pos;          /* index of the field's first byte in in_line */
        int len;                /* number of bytes in the field */
};
74
/* Growable arrays of fields and of raw bytes. */
DECLARE_BUF(fields, struct field);
DECLARE_BUF(line, unsigned char);

/* Fields of the current input line, and the fields selected for output. */
static fields_t in_fields, out_fields;
/* Field currently being filled by the reader; NULL until the first field of a line is started. */
static struct field *in_field;
/* Bytes of all fields of the current line (separators and quotes are not stored). */
static line_t in_line;
/* Number of the line being processed; incremented by main() and reported by warn(). */
static int line_number;
82
83 static void new_field(void)
84 {
85         in_field = fields_push(&in_fields);
86         in_field->start_pos = line_count(&in_line);
87         in_field->len = 0;
88 }
89
90 static void ensure_field(void)
91 {
92         if (!in_field)
93                 new_field();
94 }
95
96 static void warn(struct format *fmt, char *msg)
97 {
98         if (!fmt->quiet)
99                 fprintf(stderr, "Warning at line %d: %s\n", line_number, msg);
100 }
101
102 static int csv_read(void)
103 {
104         int quoted = 0;
105         for (;;) {
106                 int c = getchar();
107 restart:
108                 if (c == '\r')
109                         continue;
110                 if (c < 0 || c == '\n') {
111                         if (quoted)
112                                 warn(in_format, "Missing closing quote.");
113                         if (c < 0)
114                                 return !!fields_count(&in_fields);
115                         else
116                                 return 1;
117                 }
118                 if (quoted) {
119                         if (c == in_format->quote) {
120                                 c = getchar();
121                                 if (c != in_format->quote) {
122                                         quoted = 0;
123                                         goto restart;
124                                 }
125                                 // Two quotes assimilate to one
126                         }
127                         // Fall through to pushing the character
128                 } else if (c == in_format->quote) {
129                         quoted = 1;
130                         continue;
131                 } else if (c == in_format->fs && !quoted) {
132                         ensure_field();
133                         new_field();
134                         continue;
135                 }
136                 ensure_field();
137                 *line_push(&in_line) = c;
138                 in_field->len++;
139         }
140 }
141
/* Return 1 if `c` is a space, a TAB or a form feed, 0 otherwise. */
static int is_ws(int c)
{
        switch (c) {
                case ' ':
                case '\t':
                case '\f':
                        return 1;
                default:
                        return 0;
        }
}
146
147 static void csv_write(void)
148 {
149         unsigned char *line = line_nth(&in_line, 0);
150         int n = fields_count(&out_fields);
151         for (int i=0; i<n; i++) {
152                 struct field *f = fields_nth(&out_fields, i);
153                 int need_quotes = 0;
154                 if (out_format->quote >= 0) {
155                         for (int j=0; j < f->len; j++) {
156                                 int c = line[f->start_pos + j];
157                                 if (c == out_format->fs || c == out_format->quote) {
158                                         need_quotes = 1;
159                                         break;
160                                 }
161                         }
162                 }
163                 if (i)
164                         putchar(out_format->fs);
165                 if (need_quotes)
166                         putchar(out_format->quote);
167                 for (int j=0; j < f->len; j++) {
168                         int c = line[f->start_pos + j];
169                         if (c == out_format->fs && !need_quotes)
170                                 warn(out_format, "Field separator found inside field and quoting is turned off.");
171                         if (c == out_format->quote)
172                                 putchar(c);
173                         putchar(c);
174                 }
175                 if (need_quotes)
176                         putchar(out_format->quote);
177         }
178         putchar('\n');
179 }
180
181 static int ws_read(void)
182 {
183         int ws = 0;
184         for (;;) {
185                 int c = getchar();
186                 if (c < 0)
187                         return !!fields_count(&in_fields);
188                 if (c == '\r')
189                         continue;
190                 if (c == '\n')
191                         return 1;
192                 if (is_ws(c)) {
193                         ensure_field();
194                         if (!ws)
195                                 new_field();
196                         ws++;
197                 } else {
198                         ensure_field();
199                         *line_push(&in_line) = c;
200                         in_field->len++;
201                         ws = 0;
202                 }
203         }
204 }
205
206 /*** Transforms ***/
207
208 static void trim_fields(void)
209 {
210         unsigned char *line = line_nth(&in_line, 0);
211         for (int i = 0; i < fields_count(&in_fields); i++) {
212                 struct field *f = fields_nth(&in_fields, i);
213                 while (f->len && is_ws(line[f->start_pos]))
214                         f->start_pos++, f->len--;
215                 while (f->len && is_ws(line[f->start_pos + f->len - 1]))
216                         f->len--;
217         }
218 }
219
220 /*** Field selection ***/
221
/*
 * A range of fields to output. Fields are numbered from 1; a bound which
 * is zero or negative is open (select_fields() replaces it by the first
 * or the last field of the line).
 */
struct selector {
        int first_field, last_field;
};

/* All selectors given on the command line, in the order they were given. */
DECLARE_BUF(selectors, struct selector);
static selectors_t selectors;
228
229 static char *parse_selector(char *str)
230 {
231         char buf[strlen(str) + 1];
232         strcpy(buf, str);
233
234         struct selector *s = selectors_push(&selectors);
235         char *sep = strchr(buf, '-');
236         if (sep) {
237                 *sep++ = 0;
238                 s->first_field = atoi(buf);
239                 s->last_field = atoi(sep);
240         } else
241                 s->first_field = s->last_field = atoi(buf);
242
243         return NULL;
244 }
245
246 static void finish_parse_selectors(void)
247 {
248         if (!selectors_count(&selectors))
249                 parse_selector("-");
250 }
251
252 static void select_fields(void)
253 {
254         for (int i = 0; i < selectors_count(&selectors); i++) {
255                 struct selector *s = selectors_nth(&selectors, i);
256                 int first = s->first_field;
257                 if (first <= 0)
258                         first = 1;
259                 int last = s->last_field;
260                 if (last <= 0)
261                         last = fields_count(&in_fields);
262                 for (int j = first; j <= last; j++) {
263                         struct field *f = fields_push(&out_fields);
264                         if (j >= 1 && j <= fields_count(&in_fields))
265                                 *f = *fields_nth(&in_fields, j-1);
266                         else
267                                 f->start_pos = f->len = 0;
268                 }
269         }
270 }
271
272 /*** Parsing of arguments ***/
273
/* Print the help text to stdout and exit with status 0. */
static void usage(void)
{
        static const char help[] =
                "Usage: xsv <in-format> [<out-format>] <options> [<fields>]\n"
                "\n"
                "Formats:\n"
                "-t, --tsv               TAB-separated values (default)\n"
                "-c, --csv               Comma-separated values\n"
                "-w, --ws                Values separated by arbitrary whitespace\n"
                "\n"
                "Format parameters:\n"
                "-d, --fs=<char>         Delimiter of fields\n"
                "-q, --quiet             Do not show warnings\n"
                "\n"
                "Other options:\n"
                "    --trim              Trim leading and trailing whitespaces in fields\n";

        fputs(help, stdout);
        exit(0);
}
293
/*
 * Report a problem with the command-line arguments and exit with status 1.
 * If `msg` is NULL, only the hint pointing to --help is printed.
 */
static void bad_args(char *msg)
{
        if (msg != NULL)
                fprintf(stderr, "xsv: %s\n", msg);

        fputs("Try `xsv --help' for more information.\n", stderr);
        exit(1);
}
301
/* Short options in getopt() syntax; -d takes an argument. */
static const char short_options[] = "cd:qtw";

/* Codes of options which have only a long form (above the range of characters). */
enum long_options {
        OPT_HELP = 256,
        OPT_TRIM,
};

/* Long options; those with a short equivalent are mapped to its letter. */
static const struct option long_options[] = {
        { "csv",        no_argument,            NULL,   'c' },
        { "fs",         required_argument,      NULL,   'd' },
        { "quiet",      no_argument,            NULL,   'q' },
        { "trim",       no_argument,            NULL,   OPT_TRIM },
        { "tsv",        no_argument,            NULL,   't' },
        { "ws",         no_argument,            NULL,   'w' },
        { "help",       no_argument,            NULL,   OPT_HELP },
        { NULL,         0,                      NULL,   0 },
};
319
320 static void set_format(int format_id)
321 {
322         struct format *f = xmalloc(sizeof(*f));
323         memset(f, 0, sizeof(*f));
324         f->id = format_id;
325
326         switch (format_id) {
327                 case FORM_TSV:
328                         f->fs = '\t';
329                         f->quote = -1;
330                         f->read_line = csv_read;
331                         f->write_line = csv_write;
332                         break;
333                 case FORM_CSV:
334                         f->fs = ',';
335                         f->quote = '"';
336                         f->read_line = csv_read;
337                         f->write_line = csv_write;
338                         break;
339                 case FORM_WS:
340                         f->fs = ' ';
341                         f->quote = -1;
342                         f->read_line = ws_read;
343                         f->write_line = csv_write;
344                         break;
345         }
346
347         if (!in_format)
348                 in_format = f;
349         else if (!out_format)
350                 out_format = f;
351         else
352                 bad_args("At most two format may be given.");
353 }
354
355 static struct format *current_format(void)
356 {
357         if (out_format)
358                 return out_format;
359         if (in_format)
360                 return in_format;
361         set_format(FORM_TSV);
362         return in_format;
363 }
364
365 int main(int argc, char **argv)
366 {
367         int opt;
368         int want_trim = 0;
369
370         while ((opt = getopt_long(argc, argv, short_options, long_options, NULL)) >= 0)
371                 switch (opt) {
372                         case 'c':
373                                 set_format(FORM_CSV);
374                                 break;
375                         case 'd':
376                                 if (optarg[0])
377                                         current_format()->fs = optarg[0];
378                                 else
379                                         bad_args("No field delimiter given.");
380                                 break;
381                         case 'q':
382                                 current_format()->quiet = 1;
383                                 break;
384                         case 't':
385                                 set_format(FORM_TSV);
386                                 break;
387                         case 'w':
388                                 set_format(FORM_WS);
389                                 break;
390                         case OPT_HELP:
391                                 usage();
392                         case OPT_TRIM:
393                                 want_trim = 1;
394                                 break;
395                         default:
396                                 bad_args(NULL);
397                 }
398
399         current_format();
400         if (!out_format)
401                 out_format = in_format;
402
403         for (int i = optind; i < argc; i++) {
404                 char *err = parse_selector(argv[i]);
405                 if (err)
406                         bad_args(err);
407         }
408         finish_parse_selectors();
409
410         fields_init(&in_fields);
411         fields_init(&out_fields);
412         line_init(&in_line);
413
414         for (;;) {
415                 line_number++;
416                 fields_reset(&in_fields);
417                 line_reset(&in_line);
418                 in_field = NULL;
419                 if (!in_format->read_line())
420                         break;
421
422                 if (want_trim)
423                         trim_fields();
424
425                 fields_reset(&out_fields);
426                 select_fields();
427
428                 out_format->write_line();
429         }
430
431         return 0;
432 }