From 42b9646b9fe4ef07516ea8d3a06bc6b18c522b4d Mon Sep 17 00:00:00 2001 From: Martin Mares Date: Mon, 17 Jun 2013 21:48:22 +0200 Subject: [PATCH] Better analysis of HTTP caching headers --- README | 16 ++- TODO | 1 + netgrind/http.c | 327 +++++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 308 insertions(+), 36 deletions(-) diff --git a/README b/README index 97f48e1..71a02dd 100644 --- a/README +++ b/README @@ -58,7 +58,7 @@ global transaction ID cache status --+ | | duration [sec] | request URI | source address:port destination addr:port | | | | | | | | | | | | | | | -000001 2003-06-06 22:53:38.642 81.27.194.19:1175 205.217.153.53:80 123.123.123.123 200 ... 0 14030 0.957 0.444 GET http://... +000001 2003-06-06 22:53:38.642 81.27.194.19:1175 205.217.153.53:80 123.123.123.123 200 .... 0 14030 0.957 0.444 GET http://... | | | date and time of request start X-Forwarded-For HTTP response code or (if it's the first request on a connection, ERR = HTTP violation @@ -71,12 +71,24 @@ Cache status codes: 1st char: request cacheing control . = none N = don't cache R = revalidate + 2nd char: response cacheability . = not restricted (first matching letter used) N = don't cache P = private S = don't store to non-volatile storage + X = already expired + V = "Vary:" header present R = must be revalidated - 3rd char: cache response . = X-Cache not found + E = short expiration time (less than 5min) + L = long expiration time (5min or more) + lowercase version = but cached by a HTTP/1.0 cache + + 3rd char: response validator . = none + E = ETag + W = weak ETag + L = Last-Modified + + 4th char: cache response . = X-Cache not found (X-Cache header, optional) - = MISS + = HIT ? = other diff --git a/TODO b/TODO index bb690c6..f455e77 100644 --- a/TODO +++ b/TODO @@ -4,3 +4,4 @@ Various ideas: - decoding SOCKS - data type recognition ? - count extra data between IP end and eth end +* postprocessors have to be updated after recent changes of cache status flags diff --git a/netgrind/http.c b/netgrind/http.c index 7cd4026..d65672c 100644 --- a/netgrind/http.c +++ b/netgrind/http.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #define MAXLINE 4096 @@ -112,6 +113,37 @@ static uns find_token(byte *hay, byte *needle) return 0; } +static byte *find_token_val(byte *hay, byte *needle) +{ + if (!hay) + return NULL; + while (*hay) + { + if (*hay == ' ' || *hay == '\t' || *hay == ',') + hay++; + else + { + byte *h = hay; + while (*hay && *hay != ',' && *hay != ' ' && *hay != '\t' && *hay != '=') + hay++; + if (*hay == '=') + { + uns old = *hay; + *hay = 0; + uns found = !strcasecmp(h, needle); + *hay = old; + if (found) + return hay + 1; + } + while (*hay && *hay != ',' && *hay != ' ' && *hay != '\t') + hay++; + } + } + return NULL; +} + +/* Logging */ + static void http_log_start(struct http_state *s) { if (!http_log_dir) @@ -157,6 +189,260 @@ static void http_log_body(struct http_state *s, byte *data, uns len) fwrite(data, len, 1, s->log_file); } +/* Date parsing */ + +static int +validate_time(byte *a, byte *p) +{ + while (*p && *a) + { + switch (*p) + { + case '.': + break; + case '$': + if (*a == ' ') + break; + /* Fall-thru */ + case '#': + if (*a < '0' || *a > '9') + return 0; + break; + case 'a': + if (*a < 'a' || *a > 'z') + return 0; + break; + case 'A': + if (*a < 'A' || *a > 'Z') + return 0; + break; + case '@': + if ((*a < 'a' && *a > 'z') && (*a < 'A' && *a > 'Z')) + return 0; + break; + case '*': + return 1; + case '_': + if (*a != ' ' && *a != '-') + return 0; + break; + default: + if (*p != *a) + return 0; + } + p++; + a++; + } + return (*p == *a || *p == '*'); +} + +static int +twodig(byte *p) +{ + if (p[0] == ' ') + return p[1] - '0'; + else + return (p[0] - '0')*10 + p[1] - '0'; +} + +static int +fourdig(byte *p) +{ + return twodig(p)*100 + twodig(p+2); +} + +static byte short_months[] = "JanFebMarAprMayJunJulAugSepOctNovDec"; + +static int +find_month(byte *p) +{ + byte *z = short_months; + int m = 0; + + while (*z) + { + m++; + if (p[0] == z[0] && p[1] == z[1] && p[2] == z[2]) + return m; + z += 3; + } + return 0; +} + +static s64 http_parse_date(char *date) +{ + char *p = date; + char *q; + struct tm tm; + int m, y; + + if (!date) + return -1; + + bzero(&tm, sizeof(tm)); + if (validate_time(p, "Aaa, ##_Aaa_#### ##:##:##*")) + { /* RFC 822/1123 */ + tm.tm_mday = twodig(p+5); + m = find_month(p+8); + if (!m) + goto unk; + tm.tm_mon = m - 1; + y = fourdig(p+12); + if (y < 1970) + goto unk; + tm.tm_year = y - 1900; + tm.tm_hour = twodig(p+17); + tm.tm_min = twodig(p+20); + tm.tm_sec = twodig(p+23); + goto ook; + } + if (q = strchr(p, ',')) + { + int flag = 0; + if (validate_time(q, ", ##-Aaa-## ##:##:## GMT")) /* RFC 850 */ + flag = 1; + else if (validate_time(q, ", #-Aaa-## ##:##:## GMT")) /* Incorrectly implemented RFC 850 */ + flag = 2; + if (flag) + { + tm.tm_mday = twodig(q+2); + if (flag == 2) + q--; + m = find_month(q+5); + if (!m) + goto unk; + tm.tm_mon = m - 1; + tm.tm_year = twodig(q+9); + if (tm.tm_year < 76) + tm.tm_year += 100; + tm.tm_hour = twodig(q+12); + tm.tm_min = twodig(q+15); + tm.tm_sec = twodig(q+18); + goto ook; + } + } + if (validate_time(p, "Aaa Aaa $# ##:##:## ####")) + { /* ANSI C asctime() */ + m = find_month(p+4); + if (!m) + goto unk; + tm.tm_mon = m - 1; + tm.tm_mday = twodig(p+8); + tm.tm_hour = twodig(p+11); + tm.tm_min = twodig(p+14); + tm.tm_sec = twodig(p+17); + y = fourdig(p+20); + if (y < 1980) + goto unk; + tm.tm_year = y - 1900; + goto ook; + } + if (!strcmp(p, "0") || !strcmp(p, "-1")) + return 0; /* Porcine hacks */ + +unk: + fprintf(stderr, "Unable to parse date `%s'\n", date); + return 0; + +ook: + m = timegm(&tm); + if (m == (time_t) -1) + goto unk; + return (u64) m * 1000000; +} + +static void http_cache_report(struct http_state *s, char *buf) +{ + byte *rq_pragma = http_lookup_hdr(&s->req_headers, "Pragma:"); + byte *rp_pragma = http_lookup_hdr(&s->resp_headers, "Pragma:"); + byte *rq_cc = http_lookup_hdr(&s->req_headers, "Cache-control:"); + byte *rp_cc = http_lookup_hdr(&s->resp_headers, "Cache-control:"); + byte *rp_vary = http_lookup_hdr(&s->resp_headers, "Vary:"); + byte *rp_cache = http_lookup_hdr(&s->resp_headers, "X-Cache:"); + s64 rp_expires = http_parse_date(http_lookup_hdr(&s->resp_headers, "Expires:")); + s64 rp_date = http_parse_date(http_lookup_hdr(&s->resp_headers, "Date:")); + s64 rp_time = rp_date > 0 ? rp_date : s->resp_line ? (s64) s->resp_start_time : (s64) s->req_start_time; + + // Cache control in request + if (find_token(rq_pragma, "no-cache") || find_token(rq_cc, "no-cache")) + buf[0] = 'N'; + else if (find_token(rq_cc, "max-age=0") || find_token(rq_cc, "must-revalidate")) + buf[0] = 'R'; + else + buf[0] = '.'; + + // HTTP/1.0 cache control in reply + uns expired10 = (rp_expires > 0 && rp_expires <= rp_time); + uns nocache10 = find_token(rp_pragma, "no-cache"); + + // Expiration of reply + byte *rp_maxage_arg = find_token_val(rp_cc, "max-age"); + s64 rp_maxage; + if (rp_maxage_arg) + rp_maxage = (s64) atoi(rp_maxage_arg) * 1000000; + else if (expired10) + rp_maxage = 0; + else if (rp_expires >= rp_time) + rp_maxage = rp_expires - rp_time; + else + rp_maxage = -1; +#if 0 + if (s->log_file) + fprintf(s->log_file, "; rp_expires=%Ld rp_time=%Ld rp_maxage=%Ld expired10=%d nocache10=%d\n", rp_expires, rp_time, rp_maxage, expired10, nocache10); +#endif + + // Cache control in reply + if (nocache10 || find_token(rp_cc, "no-cache")) + buf[1] = 'N'; + else if (find_token(rp_cc, "private")) + buf[1] = 'P'; + else if (find_token(rp_cc, "no-store")) + buf[1] = 'S'; + else if (expired10 || !rp_maxage) + buf[1] = 'X'; + else if (rp_vary) + buf[1] = 'V'; + else if (find_token(rp_cc, "must-revalidate")) + buf[1] = 'R'; + else if (rp_maxage > 0 && rp_maxage < (s64) 300 * 1000000) + buf[1] = 'E'; + else if (rp_maxage > 0) + buf[1] = 'L'; + else + buf[1] = '.'; + + // Do HTTP/1.1 and HTTP/1.0 behaviour match? + if (buf[1] != '.' && buf[1] != 'E' && buf[1] != 'L' && !expired10 && !nocache10) + buf[1] |= 0x20; // Lowercase + + // Validators in reply + byte *rp_etag = http_lookup_hdr(&s->resp_headers, "ETag:"); + s64 rp_lastmod = http_parse_date(http_lookup_hdr(&s->resp_headers, "Last-Modified:")); + if (rp_etag) + { + if (rp_etag[0] == 'W' && rp_etag[1] == '/') + buf[2] = 'W'; + else + buf[2] = 'E'; + } + else if (rp_lastmod > 0) + buf[2] = 'L'; + else + buf[2] = '.'; + + // Is there cache status in reply? + if (!rp_cache) + buf[3] = '.'; + else if (!strncmp(rp_cache, "HIT ", 4)) + buf[3] = '+'; + else if (!strncmp(rp_cache, "MISS ", 5)) + buf[3] = '-'; + else + buf[3] = '?'; + + buf[4] = 0; +} + static void http_report(struct flow *f, struct http_state *s, u64 when, byte *reason) { byte *method, *url, *x, *y, *stat; @@ -207,37 +493,10 @@ static void http_report(struct flow *f, struct http_state *s, u64 when, byte *re char *ffor = http_lookup_hdr(&s->req_headers, "X-Forwarded-For:"); /* Find out cacheability */ - byte *rq_pragma = http_lookup_hdr(&s->req_headers, "Pragma:"); - byte *rp_pragma = http_lookup_hdr(&s->resp_headers, "Pragma:"); - byte *rq_cc = http_lookup_hdr(&s->req_headers, "Cache-control:"); - byte *rp_cc = http_lookup_hdr(&s->resp_headers, "Cache-control:"); - byte *rp_cache = http_lookup_hdr(&s->resp_headers, "X-Cache:"); - uns rq_cflag, rp_cflag, rp_hit; - if (find_token(rq_pragma, "no-cache") || find_token(rq_cc, "no-cache")) - rq_cflag = 'N'; - else if (find_token(rq_cc, "max-age=0") || find_token(rq_cc, "must-revalidate")) - rq_cflag = 'R'; - else - rq_cflag = '.'; - if (find_token(rp_pragma, "no-cache") || find_token(rp_cc, "no-cache")) - rp_cflag = 'N'; - else if (find_token(rp_cc, "private")) - rp_cflag = 'P'; - else if (find_token(rp_cc, "no-store")) - rp_cflag = 'S'; - else if (find_token(rp_cc, "must-revalidate")) - rp_cflag = 'R'; - else - rp_cflag = '.'; - if (!rp_cache) - rp_hit = '.'; - else if (!strncmp(rp_cache, "HIT ", 4)) - rp_hit = '+'; - else if (!strncmp(rp_cache, "MISS ", 5)) - rp_hit = '-'; - else - rp_hit = '?'; + char cache_flags[16]; + http_cache_report(s, cache_flags); + /* Format log message */ byte stamp[TIMESTAMP_LEN], src[22], dst[22]; sprintf(src, "%d.%d.%d.%d:%d", IPQUAD(f->saddr), ntohs(f->sport)); sprintf(dst, "%d.%d.%d.%d:%d", IPQUAD(f->daddr), ntohs(f->dport)); @@ -249,11 +508,11 @@ static void http_report(struct flow *f, struct http_state *s, u64 when, byte *re if (sep = strchr(ctype, ';')) *sep = 0; if (!s->xact_id) - printf("# id timestamp source destination forwarded-for res cac que length total time wait time ctype method URL\n"); - /* 000000 2003-06-06 22:53:38.642 81.27.194.19:1175 205.217.153.53:80 123.123.123.123 200 ... 0 14030 0.957 0.444 text/plain GET http://... */ - printf("%06u %s %-21s %-21s %-15s %-3s %c%c%c %3d %8d %6d.%03d %6d.%03d %-12s %s %s\n", + printf("# id timestamp source destination forwarded-for res cach que length total time wait time ctype method URL\n"); + /* 000000 2003-06-06 22:53:38.642 81.27.194.19:1175 205.217.153.53:80 123.123.123.123 200 .... 0 14030 0.957 0.444 text/plain GET http://... */ + printf("%06u %s %-21s %-21s %-15s %-3s %s %3d %8d %6d.%03d %6d.%03d %-12s %s %s\n", s->xact_id, stamp, src, dst, (ffor ? : "-"), reason, - rq_cflag, rp_cflag, rp_hit, + cache_flags, s->req_counter, s->body_total_size, (uns)(ttotal/1000000), (uns)(ttotal%1000000)/1000, -- 2.39.5