]> mj.ucw.cz Git - xsv.git/blob - xsv.c
Added --trim
[xsv.git] / xsv.c
1 /*
2  *      A Swiss-Army Knife for CSV-like Files
3  *
4  *      (c) 2012 Martin Mares <mj@ucw.cz>
5  */
6
#include <getopt.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
11
12 /*** Memory allocation ***/
13
/*
 * Allocate `bytes` bytes with malloc(). If the allocation fails, print a
 * diagnostic to stderr and terminate the program with exit status 1, so
 * callers never see a NULL result.
 */
static void *xmalloc(size_t bytes)
{
        void *mem = malloc(bytes);

        if (mem)
                return mem;

        fprintf(stderr, "xsv: Out of memory (cannot allocate %zu bytes)\n", bytes);
        exit(1);
}
23
/*
 * Resize the block `old` to `bytes` bytes with realloc(). If realloc()
 * returns NULL, print a diagnostic to stderr and terminate the program
 * with exit status 1.
 */
static void *xrealloc(void *old, size_t bytes)
{
        void *mem = realloc(old, bytes);

        if (mem)
                return mem;

        fprintf(stderr, "xsv: Out of memory (cannot allocate %zu bytes)\n", bytes);
        exit(1);
}
33
/*
 * DECLARE_BUF(name, type) generates a growable array of `type`:
 *
 *      name_t                  the buffer: start pointer, used count, capacity
 *      name_init(b)            set the buffer to empty with no storage
 *      name_reset(b)           forget all elements, keep the storage
 *      name_count(b)           number of elements currently stored
 *      name_extend(b)          grow the capacity (16 at first, then doubling)
 *      name_push(b)            append one element and return a pointer to it
 *      name_nth(b, n)          pointer to the n-th element (no bounds check)
 *
 * Growing goes through xrealloc(), so element pointers obtained earlier
 * may be invalidated by a later push.
 */
#define DECLARE_BUF(name, type) \
        typedef struct { type *start; int count; int max; } name##_t; \
        static inline void name##_init(name##_t *b) \
        { \
                b->start = NULL; \
                b->max = 0; \
                b->count = 0; \
        } \
        static inline void name##_reset(name##_t *b) \
        { \
                b->count = 0; \
        } \
        static inline int name##_count(name##_t *b) \
        { \
                return b->count; \
        } \
        static void name##_extend(name##_t *b) \
        { \
                if (b->max) \
                        b->max = 2*b->max; \
                else \
                        b->max = 16; \
                b->start = xrealloc(b->start, b->max * sizeof(type)); \
        } \
        static inline type *name##_push(name##_t *b) \
        { \
                if (b->count >= b->max) \
                        name##_extend(b); \
                return &b->start[b->count++]; \
        } \
        static inline type *name##_nth(name##_t *b, int n) \
        { \
                return &b->start[n]; \
        } \
        // end
49
50 /*** Formats and their parameters ***/
51
/* Identifiers of the file formats known to the program. */
enum format_id {
        FORM_UNSPEC = 0,        /* unspecified */
        FORM_TSV,               /* TAB-separated values */
        FORM_CSV,               /* comma-separated values */
        FORM_WS,                /* values separated by arbitrary whitespace */
};
58
59 struct format {
60         enum format_id id;
61         int fs;
62         int quote;
63         int (*read_line)(void);
64         void (*write_line)(void);
65 };
66
67 static struct format *in_format, *out_format;
68
69 struct field {
70         int start_pos;
71         int len;
72 };
73
74 DECLARE_BUF(fields, struct field);
75 DECLARE_BUF(line, unsigned char);
76
77 static fields_t in_fields, out_fields;
78 static struct field *in_field;
79 static line_t in_line;
80
81 static void new_field(void)
82 {
83         in_field = fields_push(&in_fields);
84         in_field->start_pos = line_count(&in_line);
85         in_field->len = 0;
86 }
87
88 static void ensure_field(void)
89 {
90         if (!in_field)
91                 new_field();
92 }
93
94 static int csv_read(void)
95 {
96         int quoted = 0;
97         // FIXME: Complain if closing quote is missing?
98         for (;;) {
99                 int c = getchar();
100 restart:
101                 if (c < 0)
102                         return !!fields_count(&in_fields);
103                 if (c == '\r')
104                         continue;
105                 if (c == '\n')
106                         return 1;
107                 if (quoted) {
108                         if (c == in_format->quote) {
109                                 c = getchar();
110                                 if (c != in_format->quote) {
111                                         quoted = 0;
112                                         goto restart;
113                                 }
114                                 // Two quotes assimilate to one
115                         }
116                         // Fall through to pushing the character
117                 } else if (c == in_format->quote) {
118                         quoted = 1;
119                         continue;
120                 } else if (c == in_format->fs && !quoted) {
121                         ensure_field();
122                         new_field();
123                         continue;
124                 }
125                 ensure_field();
126                 *line_push(&in_line) = c;
127                 in_field->len++;
128         }
129 }
130
/* Return 1 if `c` is a space, a TAB or a form feed, and 0 otherwise. */
static int is_ws(int c)
{
        switch (c) {
                case ' ':
                case '\t':
                case '\f':
                        return 1;
                default:
                        return 0;
        }
}
135
136 static void csv_write(void)
137 {
138         unsigned char *line = line_nth(&in_line, 0);
139         int n = fields_count(&out_fields);
140         for (int i=0; i<n; i++) {
141                 struct field *f = fields_nth(&out_fields, i);
142                 int need_quotes = 0;
143                 if (out_format->quote >= 0) {
144                         for (int j=0; j < f->len; j++) {
145                                 int c = line[f->start_pos + j];
146                                 if (c == out_format->fs || c == out_format->quote) {
147                                         need_quotes = 1;
148                                         break;
149                                 }
150                         }
151                 }
152                 if (i)
153                         putchar(out_format->fs);
154                 if (need_quotes)
155                         putchar(out_format->quote);
156                 for (int j=0; j < f->len; j++) {
157                         int c = line[f->start_pos + j];
158                         if (c == out_format->quote)
159                                 putchar(c);
160                         putchar(c);
161                 }
162                 if (need_quotes)
163                         putchar(out_format->quote);
164         }
165         putchar('\n');
166 }
167
168 static int ws_read(void)
169 {
170         int ws = 0;
171         for (;;) {
172                 int c = getchar();
173                 if (c < 0)
174                         return !!fields_count(&in_fields);
175                 if (c == '\r')
176                         continue;
177                 if (c == '\n')
178                         return 1;
179                 if (is_ws(c)) {
180                         ensure_field();
181                         if (!ws)
182                                 new_field();
183                         ws++;
184                 } else {
185                         ensure_field();
186                         *line_push(&in_line) = c;
187                         in_field->len++;
188                         ws = 0;
189                 }
190         }
191 }
192
193 /*** Transforms ***/
194
195 static void trim_fields(void)
196 {
197         unsigned char *line = line_nth(&in_line, 0);
198         for (int i = 0; i < fields_count(&in_fields); i++) {
199                 struct field *f = fields_nth(&in_fields, i);
200                 while (f->len && is_ws(line[f->start_pos]))
201                         f->start_pos++, f->len--;
202                 while (f->len && is_ws(line[f->start_pos + f->len - 1]))
203                         f->len--;
204         }
205 }
206
207 /*** Field selection ***/
208
209 struct selector {
210         int first_field, last_field;
211 };
212
213 DECLARE_BUF(selectors, struct selector);
214 static selectors_t selectors;
215
216 static char *parse_selector(char *str)
217 {
218         char buf[strlen(str) + 1];
219         strcpy(buf, str);
220
221         struct selector *s = selectors_push(&selectors);
222         char *sep = strchr(buf, '-');
223         if (sep) {
224                 *sep++ = 0;
225                 s->first_field = atoi(buf);
226                 s->last_field = atoi(sep);
227         } else
228                 s->first_field = s->last_field = atoi(buf);
229
230         return NULL;
231 }
232
233 static void finish_parse_selectors(void)
234 {
235         if (!selectors_count(&selectors))
236                 parse_selector("-");
237 }
238
239 static void select_fields(void)
240 {
241         for (int i = 0; i < selectors_count(&selectors); i++) {
242                 struct selector *s = selectors_nth(&selectors, i);
243                 int first = s->first_field;
244                 if (first <= 0)
245                         first = 1;
246                 int last = s->last_field;
247                 if (last <= 0)
248                         last = fields_count(&in_fields);
249                 for (int j = first; j <= last; j++) {
250                         struct field *f = fields_push(&out_fields);
251                         if (j >= 1 && j <= fields_count(&in_fields))
252                                 *f = *fields_nth(&in_fields, j-1);
253                         else
254                                 f->start_pos = f->len = 0;
255                 }
256         }
257 }
258
259 /*** Parsing of arguments ***/
260
/* Print the help text to stdout and exit with status 0. */
static void usage(void)
{
        fputs(
                "Usage: xsv <in-format> [<out-format>] <options> [<fields>]\n"
                "\n"
                "Formats:\n"
                "-t, --tsv               TAB-separated values (default)\n"
                "-c, --csv               Comma-separated values\n"
                "-w, --ws                Values separated by arbitrary whitespace\n"
                "\n"
                "Format parameters:\n"
                "-d, --fs=<char>         Delimiter of fields\n"
                "\n"
                "Other options:\n"
                "    --trim              Trim leading and trailing whitespaces in fields\n",
                stdout);
        exit(0);
}
279
/*
 * Report a command-line error and exit with status 1. If `msg` is not
 * NULL it is printed to stderr first; a hint pointing to --help is
 * printed in either case.
 */
static void bad_args(char *msg)
{
        if (msg != NULL)
                fprintf(stderr, "xsv: %s\n", msg);
        fputs("Try `xsv --help' for more information.\n", stderr);
        exit(1);
}
287
/* Short options for getopt_long(); "d:" takes an argument. */
static const char short_options[] = "cd:tw";

/* Codes of options that have no short form; they start above the char range. */
enum long_options {
        OPT_HELP = 256,
        OPT_TRIM,
};

/* Long options for getopt_long(), ended by an all-zero entry. */
static const struct option long_options[] = {
        { .name = "csv",  .has_arg = no_argument,       .flag = NULL, .val = 'c' },
        { .name = "fs",   .has_arg = required_argument, .flag = NULL, .val = 'd' },
        { .name = "trim", .has_arg = no_argument,       .flag = NULL, .val = OPT_TRIM },
        { .name = "tsv",  .has_arg = no_argument,       .flag = NULL, .val = 't' },
        { .name = "ws",   .has_arg = no_argument,       .flag = NULL, .val = 'w' },
        { .name = "help", .has_arg = no_argument,       .flag = NULL, .val = OPT_HELP },
        { .name = NULL,   .has_arg = 0,                 .flag = NULL, .val = 0 },
};
304
305 static void set_format(int format_id)
306 {
307         struct format *f = xmalloc(sizeof(*f));
308         memset(f, 0, sizeof(*f));
309         f->id = format_id;
310
311         switch (format_id) {
312                 case FORM_TSV:
313                         f->fs = '\t';
314                         f->quote = -1;
315                         f->read_line = csv_read;
316                         f->write_line = csv_write;
317                         break;
318                 case FORM_CSV:
319                         f->fs = ',';
320                         f->quote = '"';
321                         f->read_line = csv_read;
322                         f->write_line = csv_write;
323                         break;
324                 case FORM_WS:
325                         f->fs = ' ';
326                         f->quote = -1;
327                         f->read_line = ws_read;
328                         f->write_line = csv_write;
329                         break;
330         }
331
332         if (!in_format)
333                 in_format = f;
334         else if (!out_format)
335                 out_format = f;
336         else
337                 bad_args("At most two format may be given.");
338 }
339
340 static struct format *current_format(void)
341 {
342         if (out_format)
343                 return out_format;
344         if (in_format)
345                 return in_format;
346         set_format(FORM_TSV);
347         return in_format;
348 }
349
350 int main(int argc, char **argv)
351 {
352         int opt;
353         int want_trim = 0;
354
355         while ((opt = getopt_long(argc, argv, short_options, long_options, NULL)) >= 0)
356                 switch (opt) {
357                         case 'c':
358                                 set_format(FORM_CSV);
359                                 break;
360                         case 'd':
361                                 if (optarg[0])
362                                         current_format()->fs = optarg[0];
363                                 else
364                                         bad_args("No field delimiter given.");
365                                 break;
366                         case 't':
367                                 set_format(FORM_TSV);
368                                 break;
369                         case 'w':
370                                 set_format(FORM_WS);
371                                 break;
372                         case OPT_HELP:
373                                 usage();
374                         case OPT_TRIM:
375                                 want_trim = 1;
376                                 break;
377                         default:
378                                 bad_args(NULL);
379                 }
380
381         current_format();
382         if (!out_format)
383                 out_format = in_format;
384
385         for (int i = optind; i < argc; i++) {
386                 char *err = parse_selector(argv[i]);
387                 if (err)
388                         bad_args(err);
389         }
390         finish_parse_selectors();
391
392         fields_init(&in_fields);
393         fields_init(&out_fields);
394         line_init(&in_line);
395
396         for (;;) {
397                 fields_reset(&in_fields);
398                 line_reset(&in_line);
399                 in_field = NULL;
400                 if (!in_format->read_line())
401                         break;
402
403                 if (want_trim)
404                         trim_fields();
405
406                 fields_reset(&out_fields);
407                 select_fields();
408
409                 out_format->write_line();
410         }
411
412         return 0;
413 }