2 * A Process Isolator based on Linux Containers
4 * (c) 2012 Martin Mares <mj@ucw.cz>
27 #include <sys/ptrace.h>
28 #include <sys/signal.h>
29 #include <sys/sysinfo.h>
30 #include <sys/resource.h>
31 #include <sys/mount.h>
// Shorthands for compiler function attributes
34 #define NONRET __attribute__((noreturn))
35 #define UNUSED __attribute__((unused))
// Number of elements of an array, cast to int (meaningful only for real arrays, not pointers)
36 #define ARRAY_SIZE(a) (int)(sizeof(a)/sizeof(a[0]))
// Directory that the prepare, cleanup and run code in this file chdir()s into
38 // FIXME: Make configurable, probably in compile time
39 #define BOX_DIR "/tmp/box"
// Run-time limit
43 static int timeout; /* milliseconds */
// Wall-clock limit; compared against wall_ms, so it is in milliseconds as well
44 static int wall_timeout;
// Extra timeout in milliseconds (stored as 1000 * the option argument)
45 static int extra_timeout;
// NOTE(review): presumably set by the -e option (inherit full environment); the assignment is not visible here
46 static int pass_environ;
// Address-space limit; multiplied by 1024 before it is passed to RLIMIT_AS
48 static int memory_limit;
// Stack limit; multiplied by 1024 for RLIMIT_STACK, 0 selects RLIM_INFINITY
49 static int stack_limit;
// File names opened as the boxed program's descriptors 0, 1 and 2
50 static char *redir_stdin, *redir_stdout, *redir_stderr;
// NOTE(review): presumably set from the SIGALRM handler; no write to it is visible here
54 static volatile int timer_tick;
// Set with gettimeofday(); subtracted from the current time to obtain wall-clock time
55 static struct timeval start_time;
// Value of sysconf(_SC_CLK_TCK); converts tick counts from /proc into milliseconds
56 static int ticks_per_sec;
// Non-zero when the last msg() format string did not end with a newline
57 static int partial_line;
// Largest VmPeak value seen so far, in KB
59 static int mem_peak_kb;
// CPU time (user + system) and wall-clock time, both in milliseconds
60 static int total_ms, wall_ms;
// Forward declarations of functions used before their definitions
62 static void die(char *msg, ...) NONRET;
63 static void sample_mem_peak(void);
// Stream that meta_printf() writes to
67 static FILE *metafile;
/*
 * Select the destination of the meta-information output.
 * name: file to write to; the name "-" is handled specially
 *       (NOTE(review): presumably it selects stdout -- the branch body is
 *       not visible here). Any other name is opened for writing, which
 *       truncates an existing file.
 */
70 meta_open(const char *name)
72 if (!strcmp(name, "-"))
77 metafile = fopen(name, "w");
// The die() call carries the failed-open message
79 die("Failed to open metafile '%s'",name);
// Guard: applies only when a metafile is set and it is not stdout (the guarded statement is not visible here)
85 if (metafile && metafile != stdout)
/*
 * printf-style writer for the metafile. The format attribute lets the
 * compiler check the variable arguments against fmt.
 */
89 static void __attribute__((format(printf,1,2)))
90 meta_printf(const char *fmt, ...)
97 vfprintf(metafile, fmt, args);
/*
 * Compute the final timing figures and write them to the metafile.
 * rus: resource usage of the process; its user and system CPU times are
 *      summed into total_ms.
 * Also sets wall_ms to the time elapsed since start_time.
 */
102 final_stats(struct rusage *rus)
104 struct timeval total, now, wall;
// CPU time = user time + system time, converted to milliseconds
105 timeradd(&rus->ru_utime, &rus->ru_stime, &total);
106 total_ms = total.tv_sec*1000 + total.tv_usec/1000;
// Wall-clock time = now - start_time, converted to milliseconds
107 gettimeofday(&now, NULL);
108 timersub(&now, &start_time, &wall);
109 wall_ms = wall.tv_sec*1000 + wall.tv_usec/1000;
// Both times are printed as seconds with three decimal places
111 meta_printf("time:%d.%03d\n", total_ms/1000, total_ms%1000);
112 meta_printf("time-wall:%d.%03d\n", wall_ms/1000, wall_ms%1000);
// mem_peak_kb is widened before the multiplication, so the byte count is computed in unsigned long long
113 meta_printf("mem:%llu\n", (unsigned long long) mem_peak_kb * 1024);
116 /*** Messages and exits ***/
// Kill the boxed process: first its whole process group (negative pid), then the process itself
124 kill(-box_pid, SIGKILL);
125 kill(box_pid, SIGKILL);
126 meta_printf("killed:1\n");
// Collect the killed process; the condition below repeats the wait when it was interrupted (EINTR)
131 p = wait4(box_pid, &stat, 0, &rus);
132 while (p < 0 && errno == EINTR);
// %m is the glibc printf extension that prints strerror(errno)
134 fprintf(stderr, "UGH: Lost track of the process (%m)\n");
150 /* Report an error of the sandbox itself */
// Declared noreturn and checked like printf (format string is argument 1)
151 static void NONRET __attribute__((format(printf,1,2)))
// Format the message into buf and record it in the metafile under status XX
158 vsnprintf(buf, sizeof(buf), msg, args);
159 meta_printf("status:XX\nmessage:%s\n", buf);
165 /* Report an error of the program inside the sandbox */
// Declared noreturn and checked like printf (format string is argument 1)
166 static void NONRET __attribute__((format(printf,1,2)))
// A format string of the form "XY: text" carries a two-character status code in front
172 if (msg[0] && msg[1] && msg[2] == ':' && msg[3] == ' ')
174 meta_printf("status:%c%c\n", msg[0], msg[1]);
// Format the message into buf and record it in the metafile
178 vsnprintf(buf, sizeof(buf), msg, args);
179 meta_printf("message:%s\n", buf);
185 /* Write a message, but only if in verbose mode */
186 static void __attribute__((format(printf,1,2)))
193 int len = strlen(msg);
// Remember whether this output leaves the current stderr line unfinished.
// NOTE(review): msg[len-1] needs len > 0 -- confirm that a guard exists on a line not visible here.
195 partial_line = (msg[len-1] != '\n');
196 vfprintf(stderr, msg, args);
// Allocation failure is fatal: die() is declared NONRET and does not return
205 void *p = malloc(size);
207 die("Out of memory");
211 /*** Environment rules ***/
// Fields of one environment rule; the rules form a singly linked list through next
214 char *var; // Variable to match
215 char *val; // ""=clear, NULL=inherit
217 struct env_rule *next;
// Head of the rule list, and the link through which the next user rule gets appended
220 static struct env_rule *first_env_rule;
221 static struct env_rule **last_env_rule = &first_env_rule;
// Built-in rules; setup_environment() links them in front of the user rules
223 static struct env_rule default_env_rules[] = {
224 { "LIBC_FATAL_STDERR_", "1" }
/*
 * Create an environment rule from the string a0 and append it to the rule
 * list. main() tests the return value for zero.
 */
228 set_env_action(char *a0)
// A single allocation holds the rule and, right behind it, strlen(a0)+1 extra bytes
230 struct env_rule *r = xmalloc(sizeof(*r) + strlen(a0) + 1);
// a points at the extra bytes that follow the struct
231 char *a = (char *)(r+1);
// Look for the '=' that separates a variable name from its value
234 char *sep = strchr(a, '=');
// Advance the tail pointer to the new rule's link
246 last_env_rule = &r->next;
/*
 * Test whether the environment entry env_entry belongs to the variable of
 * rule r. Uses r->var_len, which setup_environment() sets to strlen(r->var).
 */
252 match_env_var(char *env_entry, struct env_rule *r)
// Non-zero strncmp() result: the first var_len characters differ from the rule's name
254 if (strncmp(env_entry, r->var, r->var_len))
// The prefix is the complete variable name only if '=' follows it immediately
256 return (env_entry[r->var_len] == '=');
/*
 * Apply rule r to the environment array env.
 * env_sizep: points to the current number of entries in env; it is updated
 *            in place, and env is kept NULL-terminated.
 */
260 apply_env_rule(char **env, int *env_sizep, struct env_rule *r)
262 // First remove the variable if already set
// Search env for an entry that matches the rule
264 while (pos < *env_sizep && !match_env_var(env[pos], r))
266 if (pos < *env_sizep)
// Overwrite the matching slot with the entry at index *env_sizep and clear that slot
// NOTE(review): the count is presumably decremented on a line not visible here
269 env[pos] = env[*env_sizep];
270 env[*env_sizep] = NULL;
273 // What is the new value?
// Build a fresh "var=val" string from the rule
279 new = xmalloc(r->var_len + 1 + strlen(r->val) + 1);
280 sprintf(new, "%s=%s", r->var, r->val);
// Look the variable up in the process environment and reuse that entry
285 while (environ[pos] && !match_env_var(environ[pos], r))
287 if (!(new = environ[pos]))
291 // Add it at the end of the array
292 env[(*env_sizep)++] = new;
293 env[*env_sizep] = NULL;
/*
 * Build the environment array for the boxed program from the rule list.
 * box_inside() passes the result to execve().
 */
297 setup_environment(void)
299 // Link built-in rules with user rules
// Walking the table backwards and prepending keeps the built-in rules in table order
300 for (int i=ARRAY_SIZE(default_env_rules)-1; i >= 0; i--)
302 default_env_rules[i].next = first_env_rule;
303 first_env_rule = &default_env_rules[i];
306 // Scan the original environment
307 char **orig_env = environ;
309 while (orig_env[orig_size])
312 // For each rule, reserve one more slot and calculate length
314 for (struct env_rule *r = first_env_rule; r; r=r->next)
// var_len is what match_env_var() compares against
317 r->var_len = strlen(r->var);
320 // Create a new environment
// Room for every original entry, one entry per rule, and the terminating NULL
321 char **env = xmalloc((orig_size + num_rules + 1) * sizeof(char *));
// Copies only the pointers; the strings stay shared with environ
325 memcpy(env, environ, orig_size * sizeof(char *));
332 // Apply the rules one by one
333 for (struct env_rule *r = first_env_rule; r; r=r->next)
334 apply_env_rule(env, &size, r);
336 // Return the new env and pass some gossip
339 fprintf(stderr, "Passing environment:\n");
340 for (int i=0; env[i]; i++)
341 fprintf(stderr, "\t%s\n", env[i]);
// Handler installed for SIGALRM; the signal number argument is not used
349 signal_alarm(int unused UNUSED)
351 /* Time limit checks are synchronous, so we only schedule them there. */
// Handler installed for SIGINT; the signal number argument is not used
357 signal_int(int unused UNUSED)
359 /* Interrupts are fatal, so no synchronization requirements. */
// Record the signal in the metafile, then report the "SG" status through err(), which does not return
360 meta_printf("exitsig:%d\n", SIGINT);
361 err("SG: Interrupted");
// Size of the buffers that read_proc_file() fills
364 #define PROC_BUF_SIZE 4096
/*
 * Read /proc/<box_pid>/<name> into buf.
 * buf:  buffer of PROC_BUF_SIZE bytes; it is also used to build the path
 * name: file name below /proc/<box_pid>/
 * fdp:  where the descriptor of the file is kept (the callers visible in
 *       this file pass the address of a static int)
 * Every failure is reported through die().
 */
366 read_proc_file(char *buf, char *name, int *fdp)
372 sprintf(buf, "/proc/%d/%s", (int) box_pid, name);
373 *fdp = open(buf, O_RDONLY);
375 die("open(%s): %m", buf);
// Rewind so that the read starts at the beginning of the file
377 lseek(*fdp, 0, SEEK_SET);
378 if ((c = read(*fdp, buf, PROC_BUF_SIZE-1)) < 0)
379 die("read on /proc/$pid/%s: %m", name);
// A read that fills the buffer may have been cut short, so it is rejected
380 if (c >= PROC_BUF_SIZE-1)
381 die("/proc/$pid/%s too long", name);
// Wall-clock check: time elapsed since start_time, in milliseconds, against wall_timeout
390 struct timeval now, wall;
392 gettimeofday(&now, NULL);
393 timersub(&now, &start_time, &wall);
394 wall_ms = wall.tv_sec*1000 + wall.tv_usec/1000;
395 if (wall_ms > wall_timeout)
396 err("TO: Time limit exceeded (wall clock)");
398 fprintf(stderr, "[wall time check: %d msec]\n", wall_ms);
// CPU-time check: utime and stime are parsed from /proc/<box_pid>/stat
402 char buf[PROC_BUF_SIZE], *x;
403 int utime, stime, ms;
// Static, so the descriptor value survives between calls
404 static int proc_stat_fd;
405 read_proc_file(buf, "stat", &proc_stat_fd);
// Skip the first space-delimited field
407 while (*x && *x != ' ')
412 die("proc stat syntax error 1");
// Scan up to a ')' that is followed by a space, then step over it
413 while (*x && (*x != ')' || x[1] != ' '))
415 while (*x == ')' || *x == ' ')
// Skip one character field and ten integer fields, then read the two tick counters
417 if (sscanf(x, "%*c %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %d %d", &utime, &stime) != 2)
418 die("proc stat syntax error 2");
// Ticks to milliseconds
419 ms = (utime + stime) * 1000 / ticks_per_sec;
421 fprintf(stderr, "[time check: %d msec]\n", ms);
// The limit counts as exceeded only when ms is above both timeout and extra_timeout
422 if (ms > timeout && ms > extra_timeout)
423 err("TO: Time limit exceeded");
428 sample_mem_peak(void)
431 * We want to find out the peak memory usage of the process, which is
432 * maintained by the kernel, but unfortunately it gets lost when the
433 * process exits (it is not reported in struct rusage). Therefore we
434 * have to sample it whenever we suspect that the process is about
// Read /proc/<box_pid>/status and scan it as "key: value" lines
437 char buf[PROC_BUF_SIZE], *x;
// Static, so the descriptor value survives between calls
438 static int proc_status_fd;
439 read_proc_file(buf, "status", &proc_status_fd);
// The key runs up to the ':'
445 while (*x && *x != ':' && *x != '\n')
// End of data or a line without ':'
447 if (!*x || *x == '\n')
// Skip the blanks in front of the value
450 while (*x == ' ' || *x == '\t')
// The value runs to the end of the line
454 while (*x && *x != '\n')
// Only the VmPeak line is used; mem_peak_kb keeps the largest value seen
460 if (!strcmp(key, "VmPeak"))
462 int peak = atoi(val);
463 if (peak > mem_peak_kb)
469 msg("[mem-peak: %u KB]\n", mem_peak_kb);
// Install the SIGINT handler
477 bzero(&sa, sizeof(sa));
478 sa.sa_handler = signal_int;
479 sigaction(SIGINT, &sa, NULL);
// Reference point for wall-clock time, and the tick rate used to convert /proc times
481 gettimeofday(&start_time, NULL);
482 ticks_per_sec = sysconf(_SC_CLK_TCK);
483 if (ticks_per_sec <= 0)
484 die("Invalid ticks_per_sec!");
// The SIGALRM handler is installed only when some time limit is set
486 if (timeout || wall_timeout)
488 sa.sa_handler = signal_alarm;
489 sigaction(SIGALRM, &sa, NULL);
// Wait for the boxed process and collect its resource usage
503 p = wait4(box_pid, &stat, 0, &rus);
511 die("wait4: unknown pid %d exited!", p);
// A non-zero exit code is reported as status RE
516 if (WEXITSTATUS(stat))
518 // FIXME: Recognize internal errors during setup
519 meta_printf("exitcode:%d\n", WEXITSTATUS(stat));
520 err("RE: Exited with error status %d", WEXITSTATUS(stat));
// The limits are checked against total_ms and wall_ms
522 if (timeout && total_ms > timeout)
523 err("TO: Time limit exceeded");
524 if (wall_timeout && wall_ms > wall_timeout)
525 err("TO: Time limit exceeded (wall clock)");
// Success summary; the memory figure is rounded up to whole MB
527 fprintf(stderr, "OK (%d.%03d sec real, %d.%03d sec wall, %d MB)\n",
528 total_ms/1000, total_ms%1000,
529 wall_ms/1000, wall_ms%1000,
530 (mem_peak_kb + 1023) / 1024);
// Terminated by a signal: status SG
533 if (WIFSIGNALED(stat))
536 meta_printf("exitsig:%d\n", WTERMSIG(stat));
538 err("SG: Caught fatal signal %d", WTERMSIG(stat));
// Stopped by a signal: status SG as well
540 if (WIFSTOPPED(stat))
543 meta_printf("exitsig:%d\n", WSTOPSIG(stat));
545 err("SG: Stopped by signal %d", WSTOPSIG(stat));
// Reached when none of the status tests above ended the function
548 die("wait4: unknown status %x, giving up!", stat);
// Create the mount point "root" (an existing directory is accepted) and mount a tmpfs on it
557 if (mkdir("root", 0777) < 0 && errno != EEXIST)
558 die("mkdir('root'): %m");
560 if (mount("none", "root", "tmpfs", 0, "mode=755") < 0)
561 die("Cannot mount root ramdisk: %m");
563 // FIXME: Make the list of bind-mounts configurable
564 // FIXME: Virtual dev?
565 // FIXME: Read-only mounts?
// Directories that get bind-mounted below root/
567 static const char * const dirs[] = { "box", "/bin", "/lib", "/usr", "/dev" };
568 for (int i=0; i < ARRAY_SIZE(dirs); i++)
570 const char *d = dirs[i];
571 char buf[1024]; // FIXME
// Target path: "root/" followed by the directory name without its leading slash
572 sprintf(buf, "root/%s", (d[0] == '/' ? d+1 : d));
573 printf("Binding %s on %s\n", d, buf);
574 if (mkdir(buf, 0777) < 0)
575 die("mkdir(%s): %m", buf);
// The bind mount is requested with the nosuid and nodev flags
576 if (mount(d, buf, "none", MS_BIND | MS_NOSUID | MS_NODEV, "") < 0)
577 die("Cannot bind %s on %s: %m", d, buf);
// A proc filesystem is mounted at root/proc
580 if (mkdir("root/proc", 0777) < 0)
581 die("Cannot create proc: %m");
582 if (mount("none", "root/proc", "proc", 0, "") < 0)
583 die("Cannot mount proc: %m");
585 if (chroot("root") < 0)
586 die("Chroot failed: %m");
// chroot() does not change the working directory, so this relative path is still resolved from the old one
588 if (chdir("root/box") < 0)
589 die("Cannot change current directory: %m");
/*
 * Body of the cloned child process: copy the argument vector, drop
 * privileges, redirect the standard descriptors, set resource limits and
 * execute the program.
 * arg: value handed over by clone(); run passes its argv here.
 * Every failure is reported through die(), which does not return.
 */
593 box_inside(void *arg)
// Work on a private copy of the argument pointers
603 memcpy(args, argv, argc * sizeof(char *));
// Drop privileges: group IDs first, then supplementary groups, the user ID last
608 if (setresgid(BOX_GID, BOX_GID, BOX_GID) < 0)
609 die("setresgid: %m");
610 if (setgroups(0, NULL) < 0)
611 die("setgroups: %m");
612 if (setresuid(BOX_UID, BOX_UID, BOX_UID) < 0)
613 die("setresuid: %m");
// Each open() must return the descriptor number being redirected
// NOTE(review): presumably that descriptor was closed just before, on a line not visible here
618 if (open(redir_stdin, O_RDONLY) != 0)
619 die("open(\"%s\"): %m", redir_stdin);
624 if (open(redir_stdout, O_WRONLY | O_CREAT | O_TRUNC, 0666) != 1)
625 die("open(\"%s\"): %m", redir_stdout);
630 if (open(redir_stderr, O_WRONLY | O_CREAT | O_TRUNC, 0666) != 2)
631 die("open(\"%s\"): %m", redir_stderr);
// Widen to rlim_t before multiplying, as the stack limit below does: the
// int product memory_limit * 1024 overflows for limits of 2 GB and more
639 rl.rlim_cur = rl.rlim_max = (rlim_t)memory_limit * 1024;
640 if (setrlimit(RLIMIT_AS, &rl) < 0)
641 die("setrlimit(RLIMIT_AS): %m");
// A stack limit of 0 means unlimited
644 rl.rlim_cur = rl.rlim_max = (stack_limit ? (rlim_t)stack_limit * 1024 : RLIM_INFINITY);
645 if (setrlimit(RLIMIT_STACK, &rl) < 0)
646 die("setrlimit(RLIMIT_STACK): %m");
// At most 64 open files
648 rl.rlim_cur = rl.rlim_max = 64;
649 if (setrlimit(RLIMIT_NOFILE, &rl) < 0)
650 die("setrlimit(RLIMIT_NOFILE): %m");
652 // FIXME: Create multi-process mode
653 rl.rlim_cur = rl.rlim_max = 1;
654 if (setrlimit(RLIMIT_NPROC, &rl) < 0)
655 die("setrlimit(RLIMIT_NPROC): %m");
// No locked memory
657 rl.rlim_cur = rl.rlim_max = 0;
658 if (setrlimit(RLIMIT_MEMLOCK, &rl) < 0)
659 die("setrlimit(RLIMIT_MEMLOCK): %m");
// execve() returns only on failure
661 char **env = setup_environment();
662 execve(args[0], args, env);
663 die("execve(\"%s\"): %m", args[0]);
669 // FIXME: Move chdir to common code?
// The hook is run with BOX_DIR as the working directory
670 if (chdir(BOX_DIR) < 0)
671 die("chdir(%s): %m", BOX_DIR);
// A non-zero result of system() is treated as failure of the hook
673 if (system("./prepare"))
674 die("Prepare hook failed");
// The hook is run with BOX_DIR as the working directory
680 if (chdir(BOX_DIR) < 0)
681 die("chdir(%s): %m", BOX_DIR);
// A non-zero result of system() is treated as failure of the hook;
// the message names the cleanup hook, which is the command run here
683 if (system("./cleanup"))
684 die("Cleanup hook failed");
// Work inside BOX_DIR
690 if (chdir(BOX_DIR) < 0)
691 die("chdir(%s): %m", BOX_DIR);
// The "box" subdirectory must exist and be a directory
694 if (stat("box", &st) < 0 || !S_ISDIR(st.st_mode))
695 die("Box directory not found, did you run `isolate --prepare'?");
// Arguments of the call that starts box_inside() in a new process with
// new IPC, network, mount and PID namespaces; SIGCHLD is part of the flags
698 box_inside, // Function to execute as the body of the new process
699 argv, // Pass our stack
700 SIGCHLD | CLONE_NEWIPC | CLONE_NEWNET | CLONE_NEWNS | CLONE_NEWPID,
701 argv); // Pass the arguments
705 die("clone returned 0");
// Usage output: an error line on stderr, followed by the option summary. The
// summary lines are part of one string literal continued with backslash-newline.
709 // FIXME: Prune (and also the option list)
713 fprintf(stderr, "Invalid arguments!\n");
715 Usage: box [<options>] -- <command> <arguments>\n\
718 -a <level>\tSet file access level (0=none, 1=cwd, 2=/etc,/lib,..., 3=whole fs, 9=no checks; needs -f)\n\
719 -c <dir>\tChange directory to <dir> first\n\
720 -e\t\tInherit full environment of the parent process\n\
721 -E <var>\tInherit the environment variable <var> from the parent process\n\
722 -E <var>=<val>\tSet the environment variable <var> to <val>; unset it if <var> is empty\n\
723 -f\t\tFilter system calls (-ff=very restricted)\n\
724 -i <file>\tRedirect stdin from <file>\n\
725 -k <size>\tLimit stack size to <size> KB (default: 0=unlimited)\n\
726 -m <size>\tLimit address space to <size> KB\n\
727 -M <file>\tOutput process information to <file> (name:value)\n\
728 -o <file>\tRedirect stdout to <file>\n\
729 -p <path>\tPermit access to the specified path (or subtree if it ends with a `/')\n\
730 -p <path>=<act>\tDefine action for the specified path (<act>=yes/no)\n\
731 -r <file>\tRedirect stderr to <file>\n\
732 -s <sys>\tPermit the specified syscall (be careful)\n\
733 -s <sys>=<act>\tDefine action for the specified syscall (<act>=yes/no/file)\n\
734 -t <time>\tSet run time limit (seconds, fractions allowed)\n\
735 -T\t\tAllow syscalls for measuring run time\n\
736 -v\t\tBe verbose (use multiple times for even more verbosity)\n\
737 -w <time>\tSet wall clock time limit (seconds, fractions allowed)\n\
738 -x <time>\tSet extra timeout, before which a timing-out program is not yet killed,\n\
739 \t\tso that its real execution time is reported (seconds, fractions allowed)\n\
// getopt option string; a ':' after a letter means that the option takes an argument
750 static const char short_opts[] = "a:c:eE:fi:k:m:M:o:p:r:s:t:Tvw:x:";
// Long options: none takes an argument, and each one makes getopt_long() return its OPT_* code
752 static const struct option long_opts[] = {
753 { "prepare", 0, NULL, OPT_PREPARE },
754 { "run", 0, NULL, OPT_RUN },
755 { "cleanup", 0, NULL, OPT_CLEANUP },
/*
 * Entry point: parse the command line into the file-scope settings and act
 * on the selected mode.
 * NOTE(review): the case labels are not visible here, so which option
 * letter sets which variable has to be confirmed against the full source.
 */
760 main(int argc, char **argv)
763 enum opt_code mode = 0;
765 while ((c = getopt_long(argc, argv, short_opts, long_opts, NULL)) >= 0)
// A zero return value of set_env_action() is tested here
775 if (!set_env_action(optarg))
// atol() gives no indication of malformed input
779 stack_limit = atol(optarg);
782 redir_stdin = optarg;
785 memory_limit = atol(optarg);
791 redir_stdout = optarg;
794 redir_stderr = optarg;
// Time arguments are multiplied by 1000 before being stored in the int settings
797 timeout = 1000*atof(optarg);
803 wall_timeout = 1000*atof(optarg);
806 extra_timeout = 1000*atof(optarg);
818 die("Must be started as root");
820 // FIXME: Copying of files into the box
840 die("Internal error: mode mismatch");