#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <stdarg.h>
#include <getopt.h>
+#include <pcre.h>
+
/*** Memory allocation ***/
static void *xmalloc(size_t bytes)
if (b->count >= b->max) name##_extend(b); \
return &b->start[b->count++]; \
} \
+ static inline type *name##_first(name##_t *b) { return b->start; } \
static inline type *name##_nth(name##_t *b, int n) { return &b->start[n]; } \
// end
FORM_TSV,
FORM_CSV,
FORM_WS,
+ FORM_REGEX,
};
struct format {
int quiet;
int (*read_line)(void);
void (*write_line)(void);
+ // CSV backend:
+ int always_quote;
+ // regex backend:
+ pcre *pcre;
+ pcre_extra *pcre_extra;
};
static struct format *in_format, *out_format;
new_field();
}
-static void warn(struct format *fmt, char *msg)
+/*
+ * Print a printf-style warning to stderr, prefixed with the value of
+ * line_number, unless warnings are silenced for the given format
+ * (fmt->quiet). msg is the format string; its parameters follow.
+ */
+static void warn(struct format *fmt, char *msg, ...)
+{
+	if (fmt->quiet)
+		return;
+
+	va_list args;
+	va_start(args, msg);
+	fprintf(stderr, "Warning at line %d: ", line_number);
+	// vfprintf() takes the format string first and the va_list last
+	vfprintf(stderr, msg, args);
+	fputc('\n', stderr);
+	va_end(args);
+}
+
+/*
+ * Read characters from standard input and append them to in_line until a
+ * newline or the end of input is reached. The newline itself is not stored.
+ * Returns 1 when a newline terminated the line; at end of input it returns
+ * 1 if in_line holds any characters and 0 otherwise.
+ */
+static int next_line(void)
{
- if (!fmt->quiet)
- fprintf(stderr, "Warning at line %d: %s\n", line_number, msg);
+ for (;;) {
+ int c = getchar();
+ // Carriage returns are dropped wherever they occur in the line
+ if (c == '\r')
+ continue;
+ // getchar() returned a negative value: end of input (or a read error)
+ if (c < 0)
+ return !!line_count(&in_line);
+ if (c == '\n')
+ return 1;
+ *line_push(&in_line) = c;
+ }
}
static int csv_read(void)
static void csv_write(void)
{
- unsigned char *line = line_nth(&in_line, 0);
+ unsigned char *line = line_first(&in_line);
int n = fields_count(&out_fields);
for (int i=0; i<n; i++) {
struct field *f = fields_nth(&out_fields, i);
int need_quotes = 0;
if (out_format->quote >= 0) {
- for (int j=0; j < f->len; j++) {
+ need_quotes = out_format->always_quote;
+ for (int j=0; !need_quotes && j < f->len; j++) {
int c = line[f->start_pos + j];
- if (c == out_format->fs || c == out_format->quote) {
+ if (c == out_format->fs || c == out_format->quote)
need_quotes = 1;
- break;
- }
}
}
if (i)
}
}
+/*
+ * Compile the separator pattern rx with PCRE and store the compiled
+ * pattern and its study data in format f.
+ * Returns the error text from pcre_compile() if compilation fails.
+ * Otherwise returns NULL when pcre_study() produced study data, or the
+ * message pcre_study() left behind when it returned no data.
+ */
+static const char *regex_set(struct format *f, char *rx)
+{
+	const char *msg;
+	int offset;
+
+	f->pcre = pcre_compile(rx, PCRE_DOLLAR_ENDONLY, &msg, &offset, NULL);
+	if (f->pcre) {
+		f->pcre_extra = pcre_study(f->pcre, 0, &msg);
+		if (f->pcre_extra)
+			msg = NULL;
+		// NOTE(review): per the PCRE docs pcre_study() may return NULL without
+		// an error and then sets msg to NULL, so that case also reports success
+	}
+
+	return msg;
+}
+
+/*
+ * Read one input line via next_line() and split it into fields at every
+ * match of the separator regex stored in in_format.
+ * Returns 0 when next_line() reports no more input, 1 otherwise.
+ * An empty line produces no fields.
+ */
+static int regex_read(void)
+{
+	if (!next_line())
+		return 0;
+
+	unsigned char *c = line_first(&in_line);
+	int n = line_count(&in_line);
+	if (!n)
+		return 1;
+
+	int i = 0;
+	for (;;) {
+		// Only the first pair (start and end offset of the whole match) is used
+		int ovec[3];
+		// PCRE_NOTEMPTY rejects empty matches: a separator such as ",*" could
+		// otherwise match the empty string at offset i, leaving i unchanged
+		// and making this loop create fields forever.
+		int sep = pcre_exec(in_format->pcre, in_format->pcre_extra, (char *) c, n, i, PCRE_NOTEMPTY, ovec, 3);
+		if (sep < 0 && sep != PCRE_ERROR_NOMATCH)
+			warn(in_format, "PCRE matching error %d", sep);
+
+		// The field runs up to the separator, or to the end of the line when
+		// no further separator was found (or matching failed)
+		int end = (sep < 0) ? n : ovec[0];
+		new_field();
+		in_field->start_pos = i;
+		in_field->len = end - i;
+		if (sep < 0)
+			return 1;
+
+		// Continue searching right after the separator
+		i = ovec[1];
+	}
+}
+
/*** Transforms ***/
static void trim_fields(void)
{
- unsigned char *line = line_nth(&in_line, 0);
+ unsigned char *line = line_first(&in_line);
for (int i = 0; i < fields_count(&in_fields); i++) {
struct field *f = fields_nth(&in_fields, i);
while (f->len && is_ws(line[f->start_pos]))
-t, --tsv TAB-separated values (default)\n\
-c, --csv Comma-separated values\n\
-w, --ws Values separated by arbitrary whitespace\n\
+-r, --regex=<rx> Separator given by Perl regular expression (input only)\n\
\n\
Format parameters:\n\
-d, --fs=<char> Delimiter of fields\n\
-q, --quiet Do not show warnings\n\
+ --always-quote Put quotes around all fields (CSV output only)\n\
\n\
Other options:\n\
--trim Trim leading and trailing whitespaces in fields\n\
exit(0);
}
+/*
+ * Report a command-line usage error on stderr and exit with status 1.
+ * msg is a printf-style format string followed by its parameters; when msg
+ * is NULL, only the hint pointing to --help is printed.
+ */
-static void bad_args(char *msg)
+static void bad_args(const char *msg, ...)
{
- if (msg)
- fprintf(stderr, "xsv: %s\n", msg);
+ if (msg) {
+ va_list args;
+ va_start(args, msg);
+ fprintf(stderr, "xsv: ");
+ vfprintf(stderr, msg, args);
+ fputc('\n', stderr);
+ va_end(args);
+ }
 fprintf(stderr, "Try `xsv --help' for more information.\n");
 exit(1);
}
-static const char short_options[] = "cd:qtw";
+static const char short_options[] = "cd:qr:tw";
+// Codes for options that have no short form. Numbering starts at 256,
+// presumably so that the codes cannot collide with the character codes of
+// the short options; the following members take consecutive values.
enum long_options {
OPT_HELP = 256,
- OPT_TRIM = 257,
+ OPT_TRIM,
+ OPT_ALWAYS_QUOTE,
};
static const struct option long_options[] = {
+ { "always-quote", 0, NULL, OPT_ALWAYS_QUOTE },
{ "csv", 0, NULL, 'c' },
{ "fs", 1, NULL, 'd' },
{ "quiet", 0, NULL, 'q' },
+ { "regex", 1, NULL, 'r' },
{ "trim", 0, NULL, OPT_TRIM },
{ "tsv", 0, NULL, 't' },
{ "ws", 0, NULL, 'w' },
f->read_line = ws_read;
f->write_line = csv_write;
break;
+ case FORM_REGEX:
+ f->read_line = regex_read;
+ break;
}
if (!in_format)
{
int opt;
int want_trim = 0;
+ const char *err;
while ((opt = getopt_long(argc, argv, short_options, long_options, NULL)) >= 0)
switch (opt) {
case 'q':
current_format()->quiet = 1;
break;
+ case 'r':
+ set_format(FORM_REGEX);
+ err = regex_set(current_format(), optarg);
+ if (err)
+ bad_args("Error compiling regex: %s", err);
+ break;
case 't':
set_format(FORM_TSV);
break;
case 'w':
set_format(FORM_WS);
break;
+ case OPT_ALWAYS_QUOTE:
+ if (current_format()->id != FORM_CSV)
+ bad_args("--always-quote makes sense only for CSV.");
+ current_format()->always_quote = 1;
+ break;
case OPT_HELP:
usage();
case OPT_TRIM:
current_format();
if (!out_format)
out_format = in_format;
+ if (!in_format->read_line)
+ bad_args("Write-only format selected for input.");
+ if (!out_format->write_line)
+ bad_args("Read-only format selected for output.");
for (int i = optind; i < argc; i++) {
- char *err = parse_selector(argv[i]);
+ err = parse_selector(argv[i]);
if (err)
bad_args(err);
}