2 * A Process Isolator based on Linux Containers
4 * (c) 2012 Martin Mares <mj@ucw.cz>
27 #include <sys/ptrace.h>
28 #include <sys/signal.h>
29 #include <sys/sysinfo.h>
30 #include <sys/resource.h>
31 #include <sys/mount.h>
34 #define NONRET __attribute__((noreturn))
35 #define UNUSED __attribute__((unused))
/* Element count of a true array (not a pointer), cast to int for use with signed loop indices. The argument and the whole expansion are parenthesized so the macro is safe in any expression context. */
36 #define ARRAY_SIZE(a) ((int)(sizeof(a)/sizeof((a)[0])))
38 // FIXME: Make configurable, probably at compile time
39 #define BOX_DIR "/tmp/box"
43 static int timeout; /* milliseconds */
44 static int wall_timeout;
45 static int extra_timeout;
46 static int pass_environ;
48 static int memory_limit;
49 static int stack_limit;
50 static char *redir_stdin, *redir_stdout, *redir_stderr;
52 static uid_t orig_uid;
53 static gid_t orig_gid;
56 static volatile sig_atomic_t timer_tick;
57 static struct timeval start_time;
58 static int ticks_per_sec;
59 static int partial_line;
60 static char cleanup_cmd[256];
62 static int total_ms, wall_ms;
64 static void die(char *msg, ...) NONRET;
68 static FILE *metafile;
71 meta_open(const char *name)
73 if (!strcmp(name, "-"))
78 metafile = fopen(name, "w");
80 die("Failed to open metafile '%s'",name);
86 if (metafile && metafile != stdout)
90 static void __attribute__((format(printf,1,2)))
91 meta_printf(const char *fmt, ...)
98 vfprintf(metafile, fmt, args);
103 final_stats(struct rusage *rus)
105 struct timeval total, now, wall;
106 timeradd(&rus->ru_utime, &rus->ru_stime, &total);
107 total_ms = total.tv_sec*1000 + total.tv_usec/1000;
108 gettimeofday(&now, NULL);
109 timersub(&now, &start_time, &wall);
110 wall_ms = wall.tv_sec*1000 + wall.tv_usec/1000;
112 meta_printf("time:%d.%03d\n", total_ms/1000, total_ms%1000);
113 meta_printf("time-wall:%d.%03d\n", wall_ms/1000, wall_ms%1000);
116 /*** Messages and exits ***/
119 xsystem(const char *cmd)
121 int ret = system(cmd);
123 die("system(\"%s\"): %m", cmd);
124 if (!WIFEXITED(ret) || WEXITSTATUS(ret))
125 die("system(\"%s\"): Exited with status %d", cmd, ret);
133 kill(-box_pid, SIGKILL);
134 kill(box_pid, SIGKILL);
135 meta_printf("killed:1\n");
140 p = wait4(box_pid, &stat, 0, &rus);
141 while (p < 0 && errno == EINTR);
143 fprintf(stderr, "UGH: Lost track of the process (%m)\n");
148 if (rc < 2 && cleanup_cmd[0])
149 xsystem(cleanup_cmd);
163 /* Report an error of the sandbox itself */
164 static void NONRET __attribute__((format(printf,1,2)))
171 vsnprintf(buf, sizeof(buf), msg, args);
172 meta_printf("status:XX\nmessage:%s\n", buf);
178 /* Report an error of the program inside the sandbox */
179 static void NONRET __attribute__((format(printf,1,2)))
185 if (msg[0] && msg[1] && msg[2] == ':' && msg[3] == ' ')
187 meta_printf("status:%c%c\n", msg[0], msg[1]);
191 vsnprintf(buf, sizeof(buf), msg, args);
192 meta_printf("message:%s\n", buf);
198 /* Write a message, but only if in verbose mode */
199 static void __attribute__((format(printf,1,2)))
206 int len = strlen(msg);
208 partial_line = (msg[len-1] != '\n');
209 vfprintf(stderr, msg, args);
218 void *p = malloc(size);
220 die("Out of memory");
224 /*** Environment rules ***/
227 char *var; // Variable to match
228 char *val; // ""=clear, NULL=inherit
230 struct env_rule *next;
233 static struct env_rule *first_env_rule;
234 static struct env_rule **last_env_rule = &first_env_rule;
236 static struct env_rule default_env_rules[] = {
237 { "LIBC_FATAL_STDERR_", "1" }
241 set_env_action(char *a0)
243 struct env_rule *r = xmalloc(sizeof(*r) + strlen(a0) + 1);
244 char *a = (char *)(r+1);
247 char *sep = strchr(a, '=');
259 last_env_rule = &r->next;
265 match_env_var(char *env_entry, struct env_rule *r)
267 if (strncmp(env_entry, r->var, r->var_len))
269 return (env_entry[r->var_len] == '=');
273 apply_env_rule(char **env, int *env_sizep, struct env_rule *r)
275 // First remove the variable if already set
277 while (pos < *env_sizep && !match_env_var(env[pos], r))
279 if (pos < *env_sizep)
282 env[pos] = env[*env_sizep];
283 env[*env_sizep] = NULL;
286 // What is the new value?
292 new = xmalloc(r->var_len + 1 + strlen(r->val) + 1);
293 sprintf(new, "%s=%s", r->var, r->val);
298 while (environ[pos] && !match_env_var(environ[pos], r))
300 if (!(new = environ[pos]))
304 // Add it at the end of the array
305 env[(*env_sizep)++] = new;
306 env[*env_sizep] = NULL;
310 setup_environment(void)
312 // Link built-in rules with user rules
313 for (int i=ARRAY_SIZE(default_env_rules)-1; i >= 0; i--)
315 default_env_rules[i].next = first_env_rule;
316 first_env_rule = &default_env_rules[i];
319 // Scan the original environment
320 char **orig_env = environ;
322 while (orig_env[orig_size])
325 // For each rule, reserve one more slot and calculate length
327 for (struct env_rule *r = first_env_rule; r; r=r->next)
330 r->var_len = strlen(r->var);
333 // Create a new environment
334 char **env = xmalloc((orig_size + num_rules + 1) * sizeof(char *));
338 memcpy(env, environ, orig_size * sizeof(char *));
345 // Apply the rules one by one
346 for (struct env_rule *r = first_env_rule; r; r=r->next)
347 apply_env_rule(env, &size, r);
349 // Return the new env and pass some gossip
352 fprintf(stderr, "Passing environment:\n");
353 for (int i=0; env[i]; i++)
354 fprintf(stderr, "\t%s\n", env[i]);
359 /*** The keeper process ***/
362 signal_alarm(int unused UNUSED)
364 /* Time limit checks are synchronous, so we only schedule them here. */
370 signal_int(int unused UNUSED)
372 /* Interrupts are fatal, so no synchronization requirements. */
373 meta_printf("exitsig:%d\n", SIGINT);
374 err("SG: Interrupted");
377 #define PROC_BUF_SIZE 4096
379 read_proc_file(char *buf, char *name, int *fdp)
385 sprintf(buf, "/proc/%d/%s", (int) box_pid, name);
386 *fdp = open(buf, O_RDONLY);
388 die("open(%s): %m", buf);
390 lseek(*fdp, 0, SEEK_SET);
391 if ((c = read(*fdp, buf, PROC_BUF_SIZE-1)) < 0)
392 die("read on /proc/$pid/%s: %m", name);
393 if (c >= PROC_BUF_SIZE-1)
394 die("/proc/$pid/%s too long", name);
403 struct timeval now, wall;
405 gettimeofday(&now, NULL);
406 timersub(&now, &start_time, &wall);
407 wall_ms = wall.tv_sec*1000 + wall.tv_usec/1000;
408 if (wall_ms > wall_timeout)
409 err("TO: Time limit exceeded (wall clock)");
411 fprintf(stderr, "[wall time check: %d msec]\n", wall_ms);
415 char buf[PROC_BUF_SIZE], *x;
416 int utime, stime, ms;
417 static int proc_stat_fd;
418 read_proc_file(buf, "stat", &proc_stat_fd);
420 while (*x && *x != ' ')
425 die("proc stat syntax error 1");
426 while (*x && (*x != ')' || x[1] != ' '))
428 while (*x == ')' || *x == ' ')
430 if (sscanf(x, "%*c %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %d %d", &utime, &stime) != 2)
431 die("proc stat syntax error 2");
432 ms = (utime + stime) * 1000 / ticks_per_sec;
434 fprintf(stderr, "[time check: %d msec]\n", ms);
435 if (ms > timeout && ms > extra_timeout)
436 err("TO: Time limit exceeded");
445 bzero(&sa, sizeof(sa));
446 sa.sa_handler = signal_int;
447 sigaction(SIGINT, &sa, NULL);
449 gettimeofday(&start_time, NULL);
450 ticks_per_sec = sysconf(_SC_CLK_TCK);
451 if (ticks_per_sec <= 0)
452 die("Invalid ticks_per_sec!");
454 if (timeout || wall_timeout)
456 sa.sa_handler = signal_alarm;
457 sigaction(SIGALRM, &sa, NULL);
471 p = wait4(box_pid, &stat, 0, &rus);
479 die("wait4: unknown pid %d exited!", p);
484 if (WEXITSTATUS(stat))
486 // FIXME: Recognize internal errors during setup
487 meta_printf("exitcode:%d\n", WEXITSTATUS(stat));
488 err("RE: Exited with error status %d", WEXITSTATUS(stat));
490 if (timeout && total_ms > timeout)
491 err("TO: Time limit exceeded");
492 if (wall_timeout && wall_ms > wall_timeout)
493 err("TO: Time limit exceeded (wall clock)");
495 fprintf(stderr, "OK (%d.%03d sec real, %d.%03d sec wall)\n",
496 total_ms/1000, total_ms%1000,
497 wall_ms/1000, wall_ms%1000);
500 if (WIFSIGNALED(stat))
503 meta_printf("exitsig:%d\n", WTERMSIG(stat));
505 err("SG: Caught fatal signal %d", WTERMSIG(stat));
507 if (WIFSTOPPED(stat))
510 meta_printf("exitsig:%d\n", WSTOPSIG(stat));
512 err("SG: Stopped by signal %d", WSTOPSIG(stat));
515 die("wait4: unknown status %x, giving up!", stat);
519 /*** The process running inside the box ***/
526 if (mkdir("root", 0777) < 0 && errno != EEXIST)
527 die("mkdir('root'): %m");
529 if (mount("none", "root", "tmpfs", 0, "mode=755") < 0)
530 die("Cannot mount root ramdisk: %m");
532 // FIXME: Make the list of bind-mounts configurable
533 // FIXME: Virtual /dev?
534 // FIXME: Read-only mounts?
536 static const char * const dirs[] = { "box", "/bin", "/lib", "/usr", "/dev" };
537 for (int i=0; i < ARRAY_SIZE(dirs); i++)
539 const char *d = dirs[i];
540 char buf[1024]; // FIXME
541 sprintf(buf, "root/%s", (d[0] == '/' ? d+1 : d));
542 msg("Binding %s on %s\n", d, buf);
543 if (mkdir(buf, 0777) < 0)
544 die("mkdir(%s): %m", buf);
545 if (mount(d, buf, "none", MS_BIND | MS_NOSUID | MS_NODEV, "") < 0)
546 die("Cannot bind %s on %s: %m", d, buf);
549 if (mkdir("root/proc", 0777) < 0)
550 die("Cannot create proc: %m");
551 if (mount("none", "root/proc", "proc", 0, "") < 0)
552 die("Cannot mount proc: %m");
554 if (chroot("root") < 0)
555 die("Chroot failed: %m");
557 if (chdir("root/box") < 0)
558 die("Cannot change current directory: %m");
562 box_inside(void *arg)
572 memcpy(args, argv, argc * sizeof(char *));
577 if (setresgid(BOX_GID, BOX_GID, BOX_GID) < 0)
578 die("setresgid: %m");
579 if (setgroups(0, NULL) < 0)
580 die("setgroups: %m");
581 if (setresuid(BOX_UID, BOX_UID, BOX_UID) < 0)
582 die("setresuid: %m");
587 if (open(redir_stdin, O_RDONLY) != 0)
588 die("open(\"%s\"): %m", redir_stdin);
593 if (open(redir_stdout, O_WRONLY | O_CREAT | O_TRUNC, 0666) != 1)
594 die("open(\"%s\"): %m", redir_stdout);
599 if (open(redir_stderr, O_WRONLY | O_CREAT | O_TRUNC, 0666) != 2)
600 die("open(\"%s\"): %m", redir_stderr);
/* Address-space cap: memory_limit is an int given in KB. Widen to rlim_t before scaling (as the stack limit below does) so limits of 2 GiB and above do not overflow int. */
608 rl.rlim_cur = rl.rlim_max = (rlim_t)memory_limit * 1024;
609 if (setrlimit(RLIMIT_AS, &rl) < 0)
610 die("setrlimit(RLIMIT_AS): %m");
613 rl.rlim_cur = rl.rlim_max = (stack_limit ? (rlim_t)stack_limit * 1024 : RLIM_INFINITY);
614 if (setrlimit(RLIMIT_STACK, &rl) < 0)
615 die("setrlimit(RLIMIT_STACK): %m");
617 rl.rlim_cur = rl.rlim_max = 64;
618 if (setrlimit(RLIMIT_NOFILE, &rl) < 0)
619 die("setrlimit(RLIMIT_NOFILE): %m");
621 // FIXME: Create multi-process mode
622 rl.rlim_cur = rl.rlim_max = 1;
623 if (setrlimit(RLIMIT_NPROC, &rl) < 0)
624 die("setrlimit(RLIMIT_NPROC): %m");
626 rl.rlim_cur = rl.rlim_max = 0;
627 if (setrlimit(RLIMIT_MEMLOCK, &rl) < 0)
628 die("setrlimit(RLIMIT_MEMLOCK): %m");
630 char **env = setup_environment();
631 execve(args[0], args, env);
632 die("execve(\"%s\"): %m", args[0]);
638 msg("Preparing sandbox directory\n");
639 xsystem("rm -rf box");
640 if (mkdir("box", 0700) < 0)
641 die("Cannot create box: %m");
642 if (chown("box", orig_uid, orig_gid) < 0)
643 die("Cannot chown box: %m");
649 msg("Deleting sandbox directory\n");
650 xsystem("rm -rf box");
657 if (stat("box", &st) < 0 || !S_ISDIR(st.st_mode))
658 die("Box directory not found, did you run `isolate --prepare'?");
661 snprintf(cmd, sizeof(cmd), "chown -R %d.%d box", BOX_UID, BOX_GID);
663 snprintf(cleanup_cmd, sizeof(cleanup_cmd), "chown -R %d.%d box", orig_uid, orig_gid);
666 box_inside, // Function to execute as the body of the new process
667 argv, // Pass our stack
668 SIGCHLD | CLONE_NEWIPC | CLONE_NEWNET | CLONE_NEWNS | CLONE_NEWPID,
669 argv); // Pass the arguments
673 die("clone returned 0");
681 printf("Process isolator 0.0\n");
682 printf("(c) 2012 Martin Mares <mj@ucw.cz>\n\n");
683 printf("Sandbox directory: %s\n", BOX_DIR);
684 printf("Sandbox credentials: uid=%u gid=%u\n", BOX_UID, BOX_GID);
690 fprintf(stderr, "Invalid arguments!\n");
692 Usage: isolate [<options>] <command>\n\
695 -e, --full-env\t\tInherit full environment of the parent process\n\
696 -E, --env=<var>\tInherit the environment variable <var> from the parent process\n\
697 -E, --env=<var>=<val>\tSet the environment variable <var> to <val>; unset it if <val> is empty\n\
698 -i, --stdin=<file>\tRedirect stdin from <file>\n\
699 -k, --stack=<size>\tLimit stack size to <size> KB (default: 0=unlimited)\n\
700 -m, --mem=<size>\tLimit address space to <size> KB\n\
701 -M, --meta=<file>\tOutput process information to <file> (name:value)\n\
702 -o, --stdout=<file>\tRedirect stdout to <file>\n\
703 -r, --stderr=<file>\tRedirect stderr to <file>\n\
704 -t, --time=<time>\tSet run time limit (seconds, fractions allowed)\n\
705 -v, --verbose\t\tBe verbose (use multiple times for even more verbosity)\n\
706 -w, --wall-time=<time>\tSet wall clock time limit (seconds, fractions allowed)\n\
707 -x, --extra-time=<time>\tSet extra timeout, before which a timing-out program is not yet killed,\n\
708 \t\t\tso that its real execution time is reported (seconds, fractions allowed)\n\
711 --prepare\t\tInitialize sandbox\n\
712 --run -- <cmd> ...\tRun given command within sandbox\n\
713 --cleanup\t\tClean up sandbox\n\
714 --version\t\tDisplay program version and configuration\n\
726 static const char short_opts[] = "eE:i:k:m:M:o:r:t:vw:x:";
728 static const struct option long_opts[] = {
729 { "full-env", 0, NULL, 'e' },
730 { "env", 1, NULL, 'E' },
731 { "stdin", 1, NULL, 'i' },
732 { "stack", 1, NULL, 'k' },
733 { "mem", 1, NULL, 'm' },
734 { "meta", 1, NULL, 'M' },
735 { "stdout", 1, NULL, 'o' },
736 { "stderr", 1, NULL, 'r' },
737 { "time", 1, NULL, 't' },
738 { "verbose", 0, NULL, 'v' },
739 { "wall-time", 1, NULL, 'w' },
740 { "extra-time", 1, NULL, 'x' },
741 { "prepare", 0, NULL, OPT_PREPARE },
742 { "run", 0, NULL, OPT_RUN },
743 { "cleanup", 0, NULL, OPT_CLEANUP },
744 { "version", 0, NULL, OPT_VERSION },
749 main(int argc, char **argv)
752 enum opt_code mode = 0;
754 while ((c = getopt_long(argc, argv, short_opts, long_opts, NULL)) >= 0)
761 if (!set_env_action(optarg))
765 stack_limit = atol(optarg);
768 redir_stdin = optarg;
771 memory_limit = atol(optarg);
777 redir_stdout = optarg;
780 redir_stderr = optarg;
783 timeout = 1000*atof(optarg);
789 wall_timeout = 1000*atof(optarg);
792 extra_timeout = 1000*atof(optarg);
806 if (mode == OPT_VERSION)
813 die("Must be started as root");
817 if (chdir(BOX_DIR) < 0)
818 die("chdir(%s): %m", BOX_DIR);
838 die("Internal error: mode mismatch");