2 * A Process Isolator based on Linux Containers
4 * (c) 2012 Martin Mares <mj@ucw.cz>
25 #include <sys/ptrace.h>
26 #include <sys/signal.h>
27 #include <sys/sysinfo.h>
28 #include <sys/resource.h>
/* Marks a function that never returns to its caller (compiler attribute). */
30 #define NONRET __attribute__((noreturn))
/* Marks a parameter or variable that is intentionally left unused. */
31 #define UNUSED __attribute__((unused))
/* Number of elements of a true array, converted to int; not meaningful for a pointer. */
32 #define ARRAY_SIZE(a) (int)(sizeof(a)/sizeof(a[0]))
/* CPU time limit; main() stores 1000*atof(arg). */
34 static int timeout; /* milliseconds */
/* Wall-clock and extra limits; main() also stores them as 1000*atof(arg), i.e. milliseconds. */
35 static int wall_timeout;
36 static int extra_timeout;
/* NOTE(review): presumably set by the -e option described in the usage text; its use is not visible in this excerpt. */
37 static int pass_environ;
/* Both limits are multiplied by 1024 before being handed to setrlimit(), so they are kept in KB. */
39 static int memory_limit;
40 static int stack_limit;
/* Paths that box_inside() opens and expects to land on descriptors 0, 1 and 2. */
41 static char *redir_stdin, *redir_stdout, *redir_stderr;
/* NOTE(review): presumably raised by the SIGALRM handler; its use is not visible in this excerpt. */
45 static volatile int timer_tick;
/* Filled in by gettimeofday(); wall-clock times are measured relative to it. */
46 static struct timeval start_time;
/* Result of sysconf(_SC_CLK_TCK); converts tick counts from /proc stat to milliseconds. */
47 static int ticks_per_sec;
/* Set when the last message written by the verbose logger did not end in '\n'. */
48 static int partial_line;
/* Peak memory figure in KB, compared against the VmPeak value read from /proc/<pid>/status. */
50 static int mem_peak_kb;
/* CPU time and wall-clock time in milliseconds; both are computed in final_stats(). */
51 static int total_ms, wall_ms;
/* Forward declarations for helpers that are used before their definitions. */
53 static void die(char *msg, ...) NONRET;
54 static void sample_mem_peak(void);
/* Stream receiving the meta-data records; assigned by meta_open(). */
58 static FILE *metafile;
/*
 * Open the meta-data output named by `name`.
 * The name "-" is special-cased (that branch's body is not visible in this
 * excerpt); any other name is opened for writing, truncating existing content.
 */
61 meta_open(const char *name)
63 if (!strcmp(name, "-"))
68 metafile = fopen(name, "w");
/* Reported through die(), which does not return; presumably reached only when fopen() failed. */
70 die("Failed to open metafile '%s'",name);
76 if (metafile && metafile != stdout)
/*
 * printf-style writer for the metafile. The format attribute lets the
 * compiler check the arguments against `fmt` at every call site.
 */
80 static void __attribute__((format(printf,1,2)))
81 meta_printf(const char *fmt, ...)
/* Forward the caller's format string and argument list to the metafile stream. */
88 vfprintf(metafile, fmt, args);
/*
 * Compute the final CPU and wall-clock times, store them in total_ms and
 * wall_ms, and write them together with the peak memory figure to the metafile.
 *   rus: resource usage record; only ru_utime and ru_stime are read.
 */
93 final_stats(struct rusage *rus)
95 struct timeval total, now, wall;
/* CPU time is user time plus system time. */
96 timeradd(&rus->ru_utime, &rus->ru_stime, &total);
97 total_ms = total.tv_sec*1000 + total.tv_usec/1000;
/* Wall-clock time is the time elapsed since start_time. */
98 gettimeofday(&now, NULL);
99 timersub(&now, &start_time, &wall);
100 wall_ms = wall.tv_sec*1000 + wall.tv_usec/1000;
/* Both times are printed as seconds with exactly three decimal places. */
102 meta_printf("time:%d.%03d\n", total_ms/1000, total_ms%1000);
103 meta_printf("time-wall:%d.%03d\n", wall_ms/1000, wall_ms%1000);
/* The cast widens mem_peak_kb before the multiplication, so the byte count is computed in 64 bits. */
104 meta_printf("mem:%llu\n", (unsigned long long) mem_peak_kb * 1024);
107 /*** Messages and exits ***/
115 kill(-box_pid, SIGKILL);
116 kill(box_pid, SIGKILL);
117 meta_printf("killed:1\n");
122 p = wait4(box_pid, &stat, 0, &rus);
123 while (p < 0 && errno == EINTR);
125 fprintf(stderr, "UGH: Lost track of the process (%m)\n");
141 /* Report an error of the sandbox itself */
142 static void NONRET __attribute__((format(printf,1,2)))
149 vsnprintf(buf, sizeof(buf), msg, args);
150 meta_printf("status:XX\nmessage:%s\n", buf);
156 /* Report an error of the program inside the sandbox */
157 static void NONRET __attribute__((format(printf,1,2)))
163 if (msg[0] && msg[1] && msg[2] == ':' && msg[3] == ' ')
165 meta_printf("status:%c%c\n", msg[0], msg[1]);
169 vsnprintf(buf, sizeof(buf), msg, args);
170 meta_printf("message:%s\n", buf);
176 /* Write a message, but only if in verbose mode */
177 static void __attribute__((format(printf,1,2)))
184 int len = strlen(msg);
186 partial_line = (msg[len-1] != '\n');
187 vfprintf(stderr, msg, args);
196 void *p = malloc(size);
198 die("Out of memory");
202 /*** Environment rules ***/
205 char *var; // Variable to match
206 char *val; // ""=clear, NULL=inherit
208 struct env_rule *next;
/* Head of the singly linked list of environment rules. */
211 static struct env_rule *first_env_rule;
/* Points at the tail link of the list (initially the head pointer itself); set_env_action() moves it to the new rule's next field. */
212 static struct env_rule **last_env_rule = &first_env_rule;
/* Built-in rules; setup_environment() links them in front of the rules already on the list. */
214 static struct env_rule default_env_rules[] = {
215 { "LIBC_FATAL_STDERR_", "1" }
/*
 * Create an environment rule from the string `a0` and append it to the rule list.
 * main() treats a zero return value as a failure.
 */
219 set_env_action(char *a0)
/* One allocation holds the rule and, right behind it, room for strlen(a0)+1 bytes of text. */
221 struct env_rule *r = xmalloc(sizeof(*r) + strlen(a0) + 1);
/* `a` addresses the text area that follows the struct. */
222 char *a = (char *)(r+1);
/* Locate the '=' in the text, if there is one. */
225 char *sep = strchr(a, '=');
/* The new rule's next field becomes the tail link for the following append. */
237 last_env_rule = &r->next;
/*
 * Test whether the environment entry `env_entry` matches the variable of rule `r`.
 * Relies on r->var_len, which setup_environment() fills in.
 */
243 match_env_var(char *env_entry, struct env_rule *r)
/* The first var_len bytes must equal the rule's variable name... */
245 if (strncmp(env_entry, r->var, r->var_len))
/* ...and the byte right after them must be '=', so a longer name sharing the prefix does not match. */
247 return (env_entry[r->var_len] == '=');
/*
 * Apply one rule to the environment array under construction.
 *   env:       NULL-terminated array of "NAME=value" strings being built
 *   env_sizep: number of entries currently in env; updated in place
 *   r:         rule to apply (r->var_len must already be set)
 */
251 apply_env_rule(char **env, int *env_sizep, struct env_rule *r)
253 // First remove the variable if already set
/* Linear search for an existing entry of this variable. */
255 while (pos < *env_sizep && !match_env_var(env[pos], r))
257 if (pos < *env_sizep)
/* Fill the hole with the entry at index *env_sizep and clear that slot. */
260 env[pos] = env[*env_sizep];
261 env[*env_sizep] = NULL;
264 // What is the new value?
/* Room for the name, '=', the value and the terminating NUL. */
270 new = xmalloc(r->var_len + 1 + strlen(r->val) + 1);
271 sprintf(new, "%s=%s", r->var, r->val);
/* Otherwise search the process's own environ for the variable. */
276 while (environ[pos] && !match_env_var(environ[pos], r))
/* environ[pos] is NULL here when the variable was not found. */
278 if (!(new = environ[pos]))
282 // Add it at the end of the array
283 env[(*env_sizep)++] = new;
284 env[*env_sizep] = NULL;
/*
 * Build the environment array for the program to be executed by applying
 * every rule on the list to a freshly allocated array. box_inside() passes
 * the result to execve().
 */
288 setup_environment(void)
290 // Link built-in rules with user rules
/* Walking the table backwards while pushing at the head keeps the table's own order. */
291 for (int i=ARRAY_SIZE(default_env_rules)-1; i >= 0; i--)
293 default_env_rules[i].next = first_env_rule;
294 first_env_rule = &default_env_rules[i];
297 // Scan the original environment
298 char **orig_env = environ;
/* environ is NULL-terminated; stop at the terminator. */
300 while (orig_env[orig_size])
303 // For each rule, reserve one more slot and calculate length
305 for (struct env_rule *r = first_env_rule; r; r=r->next)
/* Cached here because match_env_var() and apply_env_rule() read var_len. */
308 r->var_len = strlen(r->var);
311 // Create a new environment
/* One slot per inherited entry, one per rule, plus one for the terminating NULL. */
312 char **env = xmalloc((orig_size + num_rules + 1) * sizeof(char *));
/* Copies the string pointers only; the strings themselves stay shared with environ. */
316 memcpy(env, environ, orig_size * sizeof(char *));
323 // Apply the rules one by one
324 for (struct env_rule *r = first_env_rule; r; r=r->next)
325 apply_env_rule(env, &size, r);
327 // Return the new env and pass some gossip
/* Dump the resulting environment to stderr, one entry per line. */
330 fprintf(stderr, "Passing environment:\n");
331 for (int i=0; env[i]; i++)
332 fprintf(stderr, "\t%s\n", env[i]);
/* Signal handler that is installed for SIGALRM when a time limit is set; the signal number is ignored. */
340 signal_alarm(int unused UNUSED)
342 /* Time limit checks are synchronous, so we only schedule them there. */
/* Signal handler that is installed for SIGINT; the signal number is ignored. */
348 signal_int(int unused UNUSED)
350 /* Interrupts are fatal, so no synchronization requirements. */
/* Record the signal in the metafile, then report the interruption through err(). */
351 meta_printf("exitsig:%d\n", SIGINT);
352 err("SG: Interrupted");
/* Size of the buffers that callers hand to read_proc_file(). */
355 #define PROC_BUF_SIZE 4096
/*
 * Read /proc/<box_pid>/<name> into `buf`.
 *   buf:  buffer declared with PROC_BUF_SIZE bytes by the callers; also used as scratch space for the path
 *   name: file name below /proc/<box_pid>/
 *   fdp:  descriptor slot owned by the caller (callers pass the address of a static int)
 */
357 read_proc_file(char *buf, char *name, int *fdp)
/* The path is built in the caller's buffer before that buffer receives the file contents. */
363 sprintf(buf, "/proc/%d/%s", (int) box_pid, name);
364 *fdp = open(buf, O_RDONLY);
366 die("open(%s): %m", buf);
/* Rewind before reading, so every read starts at the beginning of the file. */
368 lseek(*fdp, 0, SEEK_SET);
/* At most PROC_BUF_SIZE-1 bytes are requested. */
369 if ((c = read(*fdp, buf, PROC_BUF_SIZE-1)) < 0)
370 die("read on /proc/$pid/%s: %m", name);
/* A read that fills the whole request may have been truncated, so it is rejected. */
371 if (c >= PROC_BUF_SIZE-1)
372 die("/proc/$pid/%s too long", name);
381 struct timeval now, wall;
383 gettimeofday(&now, NULL);
384 timersub(&now, &start_time, &wall);
385 wall_ms = wall.tv_sec*1000 + wall.tv_usec/1000;
386 if (wall_ms > wall_timeout)
387 err("TO: Time limit exceeded (wall clock)");
389 fprintf(stderr, "[wall time check: %d msec]\n", wall_ms);
393 char buf[PROC_BUF_SIZE], *x;
394 int utime, stime, ms;
395 static int proc_stat_fd;
396 read_proc_file(buf, "stat", &proc_stat_fd);
398 while (*x && *x != ' ')
403 die("proc stat syntax error 1");
404 while (*x && (*x != ')' || x[1] != ' '))
406 while (*x == ')' || *x == ' ')
408 if (sscanf(x, "%*c %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %d %d", &utime, &stime) != 2)
409 die("proc stat syntax error 2");
410 ms = (utime + stime) * 1000 / ticks_per_sec;
412 fprintf(stderr, "[time check: %d msec]\n", ms);
413 if (ms > timeout && ms > extra_timeout)
414 err("TO: Time limit exceeded");
419 sample_mem_peak(void)
422 * We want to find out the peak memory usage of the process, which is
423 * maintained by the kernel, but unfortunately it gets lost when the
424 * process exits (it is not reported in struct rusage). Therefore we
425 * have to sample it whenever we suspect that the process is about
/* Body of sample_mem_peak(): read /proc/<pid>/status and look for the VmPeak entry. */
/* buf receives the file contents; x is the parsing cursor. */
428 char buf[PROC_BUF_SIZE], *x;
/* Static, so the descriptor slot keeps its value between calls. */
429 static int proc_status_fd;
430 read_proc_file(buf, "status", &proc_status_fd);
/* Scan to the ':' that ends the key, stopping at the end of the line or of the data. */
436 while (*x && *x != ':' && *x != '\n')
/* This line has no ':'. */
438 if (!*x || *x == '\n')
/* Skip blanks and tabs in front of the value. */
441 while (*x == ' ' || *x == '\t')
/* Advance to the end of the current line. */
445 while (*x && *x != '\n')
/* VmPeak is the key handled here; atoi() parses the leading digits of its value. */
451 if (!strcmp(key, "VmPeak"))
453 int peak = atoi(val);
/* Only a value above the figure recorded so far is of interest. */
454 if (peak > mem_peak_kb)
/* NOTE(review): mem_peak_kb is a signed int but is printed with %u. */
460 msg("[mem-peak: %u KB]\n", mem_peak_kb);
468 bzero(&sa, sizeof(sa));
469 sa.sa_handler = signal_int;
470 sigaction(SIGINT, &sa, NULL);
472 gettimeofday(&start_time, NULL);
473 ticks_per_sec = sysconf(_SC_CLK_TCK);
474 if (ticks_per_sec <= 0)
475 die("Invalid ticks_per_sec!");
477 if (timeout || wall_timeout)
479 sa.sa_handler = signal_alarm;
480 sigaction(SIGALRM, &sa, NULL);
494 p = wait4(box_pid, &stat, WUNTRACED, &rus);
502 die("wait4: unknown pid %d exited!", p);
507 if (WEXITSTATUS(stat))
509 // FIXME: Recognize internal errors during setup
510 meta_printf("exitcode:%d\n", WEXITSTATUS(stat));
511 err("RE: Exited with error status %d", WEXITSTATUS(stat));
513 if (timeout && total_ms > timeout)
514 err("TO: Time limit exceeded");
515 if (wall_timeout && wall_ms > wall_timeout)
516 err("TO: Time limit exceeded (wall clock)");
518 fprintf(stderr, "OK (%d.%03d sec real, %d.%03d sec wall, %d MB)\n",
519 total_ms/1000, total_ms%1000,
520 wall_ms/1000, wall_ms%1000,
521 (mem_peak_kb + 1023) / 1024);
524 if (WIFSIGNALED(stat))
527 meta_printf("exitsig:%d\n", WTERMSIG(stat));
529 err("SG: Caught fatal signal %d", WTERMSIG(stat));
531 if (WIFSTOPPED(stat))
534 meta_printf("exitsig:%d\n", WSTOPSIG(stat));
536 err("SG: Stopped by signal %d", WSTOPSIG(stat));
539 die("wait4: unknown status %x, giving up!", stat);
/*
 * Set up redirections, resource limits and the environment, then execute
 * the requested program.
 *   argc, argv: the command and its arguments (main() passes what follows the options)
 */
544 box_inside(int argc, char **argv)
/* Copy the argument pointers into the local array that is later handed to execve(). */
549 memcpy(args, argv, argc * sizeof(char *));
/* chdir() returns non-zero on failure. */
551 if (set_cwd && chdir(set_cwd))
/* Each redirection must land on the matching standard descriptor (0, 1, 2); anything else is fatal. */
556 if (open(redir_stdin, O_RDONLY) != 0)
557 die("open(\"%s\"): %m", redir_stdin);
562 if (open(redir_stdout, O_WRONLY | O_CREAT | O_TRUNC, 0666) != 1)
563 die("open(\"%s\"): %m", redir_stdout);
568 if (open(redir_stderr, O_WRONLY | O_CREAT | O_TRUNC, 0666) != 2)
569 die("open(\"%s\"): %m", redir_stderr);
/* NOTE(review): memory_limit is an int, so this product is evaluated in int and overflows for limits of 2 GiB and more; the stack limit below is cast to rlim_t first. */
577 rl.rlim_cur = rl.rlim_max = memory_limit * 1024;
578 if (setrlimit(RLIMIT_AS, &rl) < 0)
579 die("setrlimit(RLIMIT_AS): %m");
/* A stack limit of 0 means unlimited. */
582 rl.rlim_cur = rl.rlim_max = (stack_limit ? (rlim_t)stack_limit * 1024 : RLIM_INFINITY);
583 if (setrlimit(RLIMIT_STACK, &rl) < 0)
584 die("setrlimit(RLIMIT_STACK): %m");
/* Cap the number of open file descriptors at 64. */
586 rl.rlim_cur = rl.rlim_max = 64;
587 if (setrlimit(RLIMIT_NOFILE, &rl) < 0)
588 die("setrlimit(RLIMIT_NOFILE): %m");
590 char **env = setup_environment();
/* execve() returns only on failure, so the next line is reached only in that case. */
591 execve(args[0], args, env);
592 die("execve(\"%s\"): %m", args[0]);
595 // FIXME: Prune (and also the getopt string)
599 fprintf(stderr, "Invalid arguments!\n");
601 Usage: box [<options>] -- <command> <arguments>\n\
604 -a <level>\tSet file access level (0=none, 1=cwd, 2=/etc,/lib,..., 3=whole fs, 9=no checks; needs -f)\n\
605 -c <dir>\tChange directory to <dir> first\n\
606 -e\t\tInherit full environment of the parent process\n\
607 -E <var>\tInherit the environment variable <var> from the parent process\n\
608 -E <var>=<val>\tSet the environment variable <var> to <val>; unset it if <var> is empty\n\
609 -f\t\tFilter system calls (-ff=very restricted)\n\
610 -i <file>\tRedirect stdin from <file>\n\
611 -k <size>\tLimit stack size to <size> KB (default: 0=unlimited)\n\
612 -m <size>\tLimit address space to <size> KB\n\
613 -M <file>\tOutput process information to <file> (name:value)\n\
614 -o <file>\tRedirect stdout to <file>\n\
615 -p <path>\tPermit access to the specified path (or subtree if it ends with a `/')\n\
616 -p <path>=<act>\tDefine action for the specified path (<act>=yes/no)\n\
617 -r <file>\tRedirect stderr to <file>\n\
618 -s <sys>\tPermit the specified syscall (be careful)\n\
619 -s <sys>=<act>\tDefine action for the specified syscall (<act>=yes/no/file)\n\
620 -t <time>\tSet run time limit (seconds, fractions allowed)\n\
621 -T\t\tAllow syscalls for measuring run time\n\
622 -v\t\tBe verbose (use multiple times for even more verbosity)\n\
623 -w <time>\tSet wall clock time limit (seconds, fractions allowed)\n\
624 -x <time>\tSet extra timeout, before which a timing-out program is not yet killed,\n\
625 \t\tso that its real execution time is reported (seconds, fractions allowed)\n\
/*
 * Entry point: parse the command-line options into the file-scope settings
 * and hand the remaining arguments to box_inside().
 */
631 main(int argc, char **argv)
/* A letter followed by ':' in the option string takes an argument, delivered in optarg. */
636 while ((c = getopt(argc, argv, "a:c:eE:fi:k:m:M:o:p:r:s:t:Tvw:x:")) >= 0)
/* A zero return from set_env_action() marks the argument as invalid. */
646 if (!set_env_action(optarg))
/* NOTE(review): atol()/atof() report no errors, and the long result of atol() is narrowed to int. */
650 stack_limit = atol(optarg);
/* The redirection targets keep pointing into argv; nothing is copied. */
653 redir_stdin = optarg;
656 memory_limit = atol(optarg);
662 redir_stdout = optarg;
665 redir_stderr = optarg;
/* Times are given in seconds (fractions allowed) and stored in milliseconds. */
668 timeout = 1000*atof(optarg);
674 wall_timeout = 1000*atof(optarg);
677 extra_timeout = 1000*atof(optarg);
/* Set both the real and the effective user ID to uid. */
686 if (setreuid(uid, uid) < 0)
/* Pass only the command and its arguments, i.e. everything after the parsed options. */
692 box_inside(argc-optind, argv+optind);
/* Safety net in case control ever gets here. */
695 die("Internal error: fell over edge of the world");