2 * A Swiss-Army Knife for CSV-like Files
4 * (c) 2012 Martin Mares <mj@ucw.cz>
15 /*** Memory allocation ***/
/*
 * malloc() wrapper. The visible failure path reports the requested size
 * on stderr as an out-of-memory error.
 * NOTE(review): the NULL test and whatever follows the message (exit?
 * return?) are elided from this listing -- confirm against the full source.
 */
17 static void *xmalloc(size_t bytes)
19 void *p = malloc(bytes);
21 fprintf(stderr, "xsv: Out of memory (cannot allocate %zu bytes)\n", bytes);
/*
 * realloc() wrapper used by the DECLARE_BUF buffers. The visible failure
 * path reports the requested size on stderr as an out-of-memory error.
 * NOTE(review): the NULL test and whatever follows the message are elided
 * from this listing -- confirm against the full source.
 */
27 static void *xrealloc(void *old, size_t bytes)
29 void *p = realloc(old, bytes);
31 fprintf(stderr, "xsv: Out of memory (cannot allocate %zu bytes)\n", bytes);
/*
 * DECLARE_BUF(name, type) generates a growable array of `type`:
 *   name_t        -- descriptor { start, count, max }
 *   name_init()   -- empty buffer with no storage (start = NULL)
 *   name_reset()  -- forget the contents (count = 0); max and start are untouched
 *   name_count()  -- number of elements in use
 *   name_extend() -- grow the capacity: 16 elements first, then doubling
 *   name_push()   -- append one slot (growing if full) and return a pointer to it;
 *                    the slot contents are not initialised
 *   name_first()  -- pointer to the first element (NULL until storage exists)
 *   name_nth()    -- pointer to element n; no bounds checking
 * Storage is resized with xrealloc(), so element pointers obtained earlier
 * may become stale after a later push.
 */
37 #define DECLARE_BUF(name, type) \
38 typedef struct { type *start; int count; int max; } name##_t; \
39 static inline void name##_init(name##_t *b) { b->start = NULL; b->count = b->max = 0; } \
40 static inline void name##_reset(name##_t *b) { b->count = 0; } \
41 static inline int name##_count(name##_t *b) { return b->count; } \
42 static void name##_extend(name##_t *b) { \
43 b->max = b->max ? 2*b->max : 16; \
44 b->start = xrealloc(b->start, b->max * sizeof(type)); \
46 static inline type *name##_push(name##_t *b) { \
47 if (b->count >= b->max) name##_extend(b); \
48 return &b->start[b->count++]; \
50 static inline type *name##_first(name##_t *b) { return b->start; } \
51 static inline type *name##_nth(name##_t *b, int n) { return &b->start[n]; } \
54 /*** Formats and their parameters ***/
/*
 * NOTE(review): the lines below are members of a declaration whose header
 * and remaining members are elided from this listing.
 */
/* Reader callback; set_format() installs csv_read, ws_read or regex_read. main() tests its result for zero. */
69 int (*read_line)(void);
/* Writer callback; in the visible code only csv_write is ever installed. main() rejects a NULL writer for output. */
70 void (*write_line)(void);
/* Result of pcre_study() on the compiled separator regex, see regex_set(). */
77 pcre_extra *pcre_extra;
/*
 * Input and output format. main() sets out_format = in_format at one point
 * (presumably when no separate output format was given -- the condition is
 * elided from this listing).
 */
80 static struct format *in_format, *out_format;
/* Growable array of struct field (fields_t) and growable byte array (line_t). */
87 DECLARE_BUF(fields, struct field);
88 DECLARE_BUF(line, unsigned char);
/* Fields found in the input and fields chosen for output; both are reset per iteration in main(). */
90 static fields_t in_fields, out_fields;
/* The input field most recently created by new_field(). */
91 static struct field *in_field;
/* Characters of the current input; fields refer to it by start_pos/len offsets. */
92 static line_t in_line;
/* Printed by warn() as the position of the problem. */
93 static int line_number;
/*
 * Start a new input field: append a slot to in_fields, make it the current
 * field (in_field) and record `pos` as its starting offset.
 * NOTE(review): the rest of the body (e.g. how `len` is initialised) is
 * elided from this listing.
 */
95 static void new_field(int pos)
97 in_field = fields_push(&in_fields);
98 in_field->start_pos = pos;
102 static void ensure_field(int pos)
/*
 * Print a warning to stderr, prefixed with the current input line number.
 * `msg` is a printf-style format string followed by its arguments.
 * vfprintf() takes the format string first and the va_list second, the
 * same order bad_args() uses.
 * NOTE(review): the use of `fmt`, the va_start()/va_end() pair and any
 * trailing output are elided from this listing.
 */
108 static void warn(struct format *fmt, char *msg, ...)
111 fprintf(stderr, "Warning at line %d: ", line_number);
114 vfprintf(stderr, msg, args);
/*
 * Collect characters into in_line. The visible return statement yields 1
 * when in_line holds at least one character and 0 otherwise.
 * NOTE(review): where `c` comes from, the loop, and the end-of-line /
 * end-of-input conditions are elided from this listing.
 */
120 static int next_line(void)
127 return !!line_count(&in_line);
130 *line_push(&in_line) = c;
/*
 * Reader for separator-delimited input with optional quoting (installed as
 * read_line by set_format()). Characters are appended to in_line.
 *
 * On a newline or a negative `c` (presumably end of input from the elided
 * read call) the function returns non-zero iff in_fields holds at least one
 * field. A doubled quote character is collapsed to a single one, and the
 * field separator is only special outside quotes.
 *
 * NOTE(review): the reading loop, the handling of `quoted` and the field
 * bookkeeping are elided from this listing; the "Missing closing quote."
 * warning is presumably issued only while still inside a quoted section.
 */
134 static int csv_read(void)
139 int i = line_count(&in_line);
143 if (c < 0 || c == '\n') {
145 warn(in_format, "Missing closing quote.");
147 return !!fields_count(&in_fields);
152 if (c == in_format->quote) {
154 if (c != in_format->quote) {
158 // Two quotes assimilate to one
160 // Fall through to pushing the character
161 } else if (c == in_format->quote) {
164 } else if (c == in_format->fs && !quoted) {
170 *line_push(&in_line) = c;
/*
 * Return non-zero when `c` is a space, a TAB or a form feed.
 * Newline, carriage return and vertical tab are not counted as whitespace.
 */
175 static int is_ws(int c)
177 return (c == ' ' || c == '\t' || c == '\f');
/*
 * Write the fields of out_fields to stdout in the output format.
 * Field contents are taken from in_line via each field's start_pos/len.
 *
 * NOTE(review): several lines (declaration of need_quotes, the conditions
 * guarding the putchar() calls, the end-of-line output) are elided from
 * this listing; the inline notes describe only what is visible.
 */
180 static void csv_write(void)
182 unsigned char *line = line_first(&in_line);
183 int n = fields_count(&out_fields);
184 for (int i=0; i<n; i++) {
185 struct field *f = fields_nth(&out_fields, i);
// Quoting is only considered when the format has a non-negative quote character
187 if (out_format->quote >= 0) {
188 need_quotes = out_format->always_quote;
// Otherwise scan the field for a separator or quote character
189 for (int j=0; !need_quotes && j < f->len; j++) {
190 int c = line[f->start_pos + j];
191 if (c == out_format->fs || c == out_format->quote)
196 putchar(out_format->fs);
198 putchar(out_format->quote);
// Emit the field contents
199 for (int j=0; j < f->len; j++) {
200 int c = line[f->start_pos + j];
201 if (c == out_format->fs && !need_quotes)
202 warn(out_format, "Field separator found inside field and quoting is turned off.");
203 if (c == out_format->quote)
208 putchar(out_format->quote);
/*
 * Reader for whitespace-separated input (installed as read_line by
 * set_format()). Scans the bytes already stored in in_line by index; some
 * decisions depend on in_format->strict_ws.
 * NOTE(review): most of the body, including how in_line gets filled, how
 * `ws` is computed and what is returned, is elided from this listing.
 */
213 static int ws_read(void)
218 unsigned char *line = line_first(&in_line);
219 int n = line_count(&in_line);
225 for (int i=0; i<n; i++) {
231 if (!in_field->start_pos &&
233 !in_format->strict_ws)
234 in_field->start_pos = i;
243 if (ws && in_format->strict_ws)
/*
 * Compile the separator regex `rx` with PCRE (PCRE_DOLLAR_ENDONLY) and
 * study it; the results are stored in f->pcre and f->pcre_extra.
 * main() passes the return value to bad_args() as the compile error text.
 * NOTE(review): the error checks and the return statements are elided
 * from this listing -- presumably NULL means success; confirm.
 */
248 static const char *regex_set(struct format *f, char *rx)
252 f->pcre = pcre_compile(rx, PCRE_DOLLAR_ENDONLY, &err, &errpos, NULL);
256 f->pcre_extra = pcre_study(f->pcre, 0, &err);
/*
 * Reader that splits in_line into fields at matches of the format's
 * compiled regex (installed as read_line by set_format()).
 * NOTE(review): how in_line gets filled, the loop over `i`, the declaration
 * of `ovec` and the return value are elided from this listing.
 */
263 static int regex_read(void)
268 unsigned char *c = line_first(&in_line);
269 int n = line_count(&in_line);
// Search for the next separator, starting at offset i
276 int sep = pcre_exec(in_format->pcre, in_format->pcre_extra, (char *) c, n, i, 0, ovec, 3);
// Any result code other than "no match" is reported as a warning here
278 if (sep != PCRE_ERROR_NOMATCH)
279 warn(in_format, "PCRE matching error %d", sep);
280 // No further occurrence of the separator: the rest is a single field
282 in_field->len = n - i;
// Separator found: the field runs up to the start of the match (ovec[0])
286 in_field->len = ovec[0] - i;
/*
 * Strip leading and trailing whitespace (as defined by is_ws()) from every
 * input field by adjusting start_pos/len; the bytes of in_line are not modified.
 * NOTE(review): the body of the trailing-whitespace loop is elided from
 * this listing.
 */
293 static void trim_fields(void)
295 unsigned char *line = line_first(&in_line);
296 for (int i = 0; i < fields_count(&in_fields); i++) {
297 struct field *f = fields_nth(&in_fields, i);
// Leading whitespace: advance the start and shrink the field
298 while (f->len && is_ws(line[f->start_pos]))
299 f->start_pos++, f->len--;
// Trailing whitespace: test the last byte of the field
300 while (f->len && is_ws(line[f->start_pos + f->len - 1]))
305 /*** Field selection ***/
/*
 * Range of field numbers covered by one selector. select_fields() treats
 * them as 1-based indexes into in_fields.
 * NOTE(review): the enclosing declaration is elided from this listing.
 */
308 int first_field, last_field;
/* Growable array of selectors, filled by parse_selector(). */
311 DECLARE_BUF(selectors, struct selector);
312 static selectors_t selectors;
/*
 * Parse one selector argument and append it to `selectors`. A selector is
 * either a single field number or two numbers around a '-'.
 * Numbers are converted with atoi(), so malformed input is not detected here.
 * NOTE(review): copying `str` into `buf`, splitting at the '-' and the
 * return value (main() stores it in `err`) are elided from this listing.
 */
314 static char *parse_selector(char *str)
// Scratch buffer sized for a copy of the argument including the terminator
316 char buf[strlen(str) + 1];
319 struct selector *s = selectors_push(&selectors);
320 char *sep = strchr(buf, '-');
// Range form
323 s->first_field = atoi(buf);
324 s->last_field = atoi(sep);
// Single-field form
326 s->first_field = s->last_field = atoi(buf);
/*
 * Called by main() after all selector arguments have been parsed.
 * The visible code only tests whether no selector was given.
 * NOTE(review): the action taken in that case is elided from this listing.
 */
331 static void finish_parse_selectors(void)
333 if (!selectors_count(&selectors))
/*
 * Build out_fields from in_fields according to the selectors, in selector
 * order. Field numbers are 1-based; a number outside the available input
 * fields produces an empty output field.
 * NOTE(review): the conditions under which `first`/`last` are adjusted
 * (e.g. open-ended ranges) are elided from this listing.
 */
337 static void select_fields(void)
339 for (int i = 0; i < selectors_count(&selectors); i++) {
340 struct selector *s = selectors_nth(&selectors, i);
341 int first = s->first_field;
344 int last = s->last_field;
// In some (elided) case the range extends to the last input field
346 last = fields_count(&in_fields);
347 for (int j = first; j <= last; j++) {
348 struct field *f = fields_push(&out_fields);
// Copy the field descriptor if the field exists ...
349 if (j >= 1 && j <= fields_count(&in_fields))
350 *f = *fields_nth(&in_fields, j-1);
// ... the alternative visible here makes the field empty
352 f->start_pos = f->len = 0;
357 /*** Parsing of arguments ***/
/*
 * Help text describing the command line: format selection options, format
 * parameters and field selectors.
 * NOTE(review): the call that prints the text and anything after it are
 * elided from this listing.
 */
359 static void usage(void)
362 Usage: xsv <in-format> [<out-format>] <options> [<fields>]\n\
365 -t, --tsv TAB-separated values (default)\n\
366 -c, --csv Comma-separated values\n\
367 -w, --ws Values separated by arbitrary whitespace\n\
368 -W, --strict-ws Like --ws, but recognize empty columns at start/end\n\
369 -r, --regex=<rx> Separator given by Perl regular expression (input only)\n\
371 Format parameters:\n\
372 -d, --fs=<char> Delimiter of fields\n\
373 -q, --quiet Do not show warnings\n\
374 --always-quote Put quotes around all fields (CSV output only)\n\
377 --trim Trim leading and trailing whitespaces in fields\n\
/*
 * Report a command-line error on stderr: "xsv: " followed by the
 * printf-style message `msg`, then a hint to run `xsv --help'.
 * NOTE(review): the va_start()/va_end() pair, a condition on `msg` (if
 * any) and what happens after the hint (exit?) are elided from this listing.
 */
382 static void bad_args(const char *msg, ...)
387 fprintf(stderr, "xsv: ");
388 vfprintf(stderr, msg, args);
392 fprintf(stderr, "Try `xsv --help' for more information.\n");
/* Short options for getopt_long(); -d and -r take an argument. */
396 static const char short_options[] = "cd:qr:twW";
/*
 * Long options for getopt_long(). Options with a short equivalent return
 * that letter; the others return an OPT_* code. The second member is 1 for
 * options that require an argument (--fs, --regex).
 */
404 static const struct option long_options[] = {
405 { "always-quote", 0, NULL, OPT_ALWAYS_QUOTE },
406 { "csv", 0, NULL, 'c' },
407 { "fs", 1, NULL, 'd' },
408 { "quiet", 0, NULL, 'q' },
409 { "regex", 1, NULL, 'r' },
410 { "strict-ws", 0, NULL, 'W' },
411 { "trim", 0, NULL, OPT_TRIM },
412 { "tsv", 0, NULL, 't' },
413 { "ws", 0, NULL, 'w' },
414 { "help", 0, NULL, OPT_HELP },
415 { NULL, 0, NULL, 0 },
/*
 * Allocate a zero-filled struct format for `format_id` and install its
 * reader/writer callbacks. A third format on the command line is rejected
 * via bad_args().
 * NOTE(review): the switch on format_id, the per-format parameters (fs,
 * quote, ...) and the assignment to in_format/out_format are elided from
 * this listing.
 */
418 static void set_format(int format_id)
420 struct format *f = xmalloc(sizeof(*f));
// Zero fill: callbacks not assigned below stay NULL, which main() checks for
421 memset(f, 0, sizeof(*f));
428 f->read_line = csv_read;
429 f->write_line = csv_write;
434 f->read_line = csv_read;
435 f->write_line = csv_write;
440 f->read_line = ws_read;
441 f->write_line = csv_write;
// No write_line assignment is visible for the regex reader
444 f->read_line = regex_read;
450 else if (!out_format)
453 bad_args("At most two format may be given.");
/*
 * Return the format that parameter options (--fs, --quiet, ...) apply to.
 * NOTE(review): the selection logic is elided from this listing; the
 * visible call creates a TSV format, presumably as the default when no
 * format has been chosen yet -- confirm.
 */
456 static struct format *current_format(void)
462 set_format(FORM_TSV);
/*
 * Entry point: parse options, validate the chosen formats, parse the field
 * selectors, then repeatedly read input with the input format and write
 * output with the output format.
 * NOTE(review): the case labels, most conditions, the loop header and the
 * calls between reading and writing are elided from this listing.
 */
466 int main(int argc, char **argv)
// Option parsing
472 while ((opt = getopt_long(argc, argv, short_options, long_options, NULL)) >= 0)
475 set_format(FORM_CSV);
// Field separator: first character of the option argument
479 current_format()->fs = optarg[0];
481 bad_args("No field delimiter given.");
484 current_format()->quiet = 1;
487 set_format(FORM_REGEX);
488 err = regex_set(current_format(), optarg);
490 bad_args("Error compiling regex: %s", err);
493 set_format(FORM_TSV);
500 current_format()->strict_ws = 1;
502 case OPT_ALWAYS_QUOTE:
503 if (current_format()->id != FORM_CSV)
504 bad_args("--always-quote makes sense only for CSV.");
505 current_format()->always_quote = 1;
// Format validation: a NULL callback means the format cannot be used in that role
518 out_format = in_format;
519 if (!in_format->read_line)
520 bad_args("Write-only format selected for input.");
521 if (!out_format->write_line)
522 bad_args("Read-only format selected for output.");
// Remaining arguments are field selectors
524 for (int i = optind; i < argc; i++) {
525 err = parse_selector(argv[i]);
529 finish_parse_selectors();
531 fields_init(&in_fields);
532 fields_init(&out_fields);
// Per-iteration processing: clear the input state, read, then write
537 fields_reset(&in_fields);
538 line_reset(&in_line);
540 if (!in_format->read_line())
546 fields_reset(&out_fields);
549 out_format->write_line();