X-Git-Url: http://mj.ucw.cz/gitweb/?a=blobdiff_plain;ds=inline;f=isolate%2Fisolate.c;h=b0cbb9aefa23276e312b25c399b0b62b26f4f716;hb=8e14b1a828f7aa8eae6f82f5fc965812628eca04;hp=c2e9b7162f98abf6c9329fbec090df137d37e211;hpb=784505ffb97052532e2b1f4f2df81b301cba2e6a;p=eval.git diff --git a/isolate/isolate.c b/isolate/isolate.c index c2e9b71..b0cbb9a 100644 --- a/isolate/isolate.c +++ b/isolate/isolate.c @@ -1,14 +1,14 @@ /* - * A Process Isolator based in Linux Containers + * A Process Isolator based on Linux Containers * - * (c) 2012 Martin Mares + * (c) 2012-2014 Martin Mares + * (c) 2012-2014 Bernard Blackham */ #define _GNU_SOURCE #include "autoconf.h" -// FIXME: prune #include #include #include @@ -20,26 +20,23 @@ #include #include #include +#include #include +#include +#include #include -#include #include -#include #include -#include #include #include #include +#include +#include #define NONRET __attribute__((noreturn)) #define UNUSED __attribute__((unused)) #define ARRAY_SIZE(a) (int)(sizeof(a)/sizeof(a[0])) -// FIXME: Make configurable, probably in compile time -#define BOX_DIR "/tmp/box" -#define BOX_UID 60000 -#define BOX_GID 60000 - static int timeout; /* milliseconds */ static int wall_timeout; static int extra_timeout; @@ -47,21 +44,43 @@ static int pass_environ; static int verbose; static int memory_limit; static int stack_limit; +static int block_quota; +static int inode_quota; +static int max_processes = 1; static char *redir_stdin, *redir_stdout, *redir_stderr; +static char *set_cwd; + +static int cg_enable; +static int cg_memory_limit; +static int cg_timing; + +static int box_id; +static char box_dir[1024]; +static pid_t box_pid; +static uid_t box_uid; +static gid_t box_gid; static uid_t orig_uid; static gid_t orig_gid; -static pid_t box_pid; -static volatile sig_atomic_t timer_tick; -static struct timeval start_time; -static int ticks_per_sec; static int partial_line; -static char cleanup_cmd[256]; +static int cleanup_ownership; +static struct timeval start_time; +static int ticks_per_sec; static int total_ms, wall_ms; +static volatile sig_atomic_t timer_tick; + +static int error_pipes[2]; +static int write_errors_to_fd; +static int read_errors_from_fd; static void die(char *msg, ...) NONRET; +static void cg_stats(void); +static int get_wall_time_ms(void); +static int get_run_time_ms(struct rusage *rus); + +static void chowntree(char *path, uid_t uid, gid_t gid); /*** Meta-files ***/ @@ -102,29 +121,20 @@ meta_printf(const char *fmt, ...) static void final_stats(struct rusage *rus) { - struct timeval total, now, wall; - timeradd(&rus->ru_utime, &rus->ru_stime, &total); - total_ms = total.tv_sec*1000 + total.tv_usec/1000; - gettimeofday(&now, NULL); - timersub(&now, &start_time, &wall); - wall_ms = wall.tv_sec*1000 + wall.tv_usec/1000; + total_ms = get_run_time_ms(rus); + wall_ms = get_wall_time_ms(); meta_printf("time:%d.%03d\n", total_ms/1000, total_ms%1000); meta_printf("time-wall:%d.%03d\n", wall_ms/1000, wall_ms%1000); + meta_printf("max-rss:%ld\n", rus->ru_maxrss); + meta_printf("csw-voluntary:%ld\n", rus->ru_nvcsw); + meta_printf("csw-forced:%ld\n", rus->ru_nivcsw); + + cg_stats(); } /*** Messages and exits ***/ -static void -xsystem(const char *cmd) -{ - int ret = system(cmd); - if (ret < 0) - die("system(\"%s\"): %m", cmd); - if (!WIFEXITED(ret) || WEXITSTATUS(ret)) - die("system(\"%s\"): Exited with status %d", cmd, ret); -} - static void NONRET box_exit(int rc) { @@ -145,8 +155,10 @@ box_exit(int rc) final_stats(&rus); } - if (rc < 2 && cleanup_cmd[0]) - xsystem(cleanup_cmd); + if (rc < 2 && cleanup_ownership) + { + chowntree("box", orig_uid, orig_gid); + } meta_close(); exit(rc); @@ -166,9 +178,19 @@ die(char *msg, ...) { va_list args; va_start(args, msg); - flush_line(); char buf[1024]; - vsnprintf(buf, sizeof(buf), msg, args); + int n = vsnprintf(buf, sizeof(buf), msg, args); + + if (write_errors_to_fd) + { + // We are inside the box, have to use error pipe for error reporting. + // We hope that the whole error message fits in PIPE_BUF bytes. + write(write_errors_to_fd, buf, n); + exit(2); + } + + // Otherwise, we in the box keeper process, so we report errors normally + flush_line(); meta_printf("status:XX\nmessage:%s\n", buf); fputs(buf, stderr); fputc('\n', stderr); @@ -212,6 +234,8 @@ msg(char *msg, ...) va_end(args); } +/*** Utility functions ***/ + static void * xmalloc(size_t size) { @@ -221,6 +245,62 @@ xmalloc(size_t size) return p; } +static char * +xstrdup(char *str) +{ + char *p = strdup(str); + if (!p) + die("Out of memory"); + return p; +} + +static int dir_exists(char *path) +{ + struct stat st; + return (stat(path, &st) >= 0 && S_ISDIR(st.st_mode)); +} + +static int rmtree_helper(const char *fpath, const struct stat *sb, + int typeflag, struct FTW *ftwbuf) +{ + if (S_ISDIR(sb->st_mode)) + { + if (rmdir(fpath) < 0) + die("Cannot rmdir %s: %m", fpath); + } + else + { + if (unlink(fpath) < 0) + die("Cannot unlink %s: %m", fpath); + } + return FTW_CONTINUE; +} + +static void +rmtree(char *path) +{ + nftw(path, rmtree_helper, 32, FTW_MOUNT | FTW_PHYS | FTW_DEPTH); +} + +static uid_t chown_uid; +static gid_t chown_gid; +static int chowntree_helper(const char *fpath, const struct stat *sb, + int typeflag, struct FTW *ftwbuf) +{ + if (lchown(fpath, chown_uid, chown_gid) < 0) + die("Cannot chown %s: %m", fpath); + else + return FTW_CONTINUE; +} + +static void +chowntree(char *path, uid_t uid, gid_t gid) +{ + chown_uid = uid; + chown_gid = gid; + nftw(path, chowntree_helper, 32, FTW_MOUNT | FTW_PHYS); +} + /*** Environment rules ***/ struct env_rule { @@ -356,6 +436,532 @@ setup_environment(void) return env; } +/*** Directory rules ***/ + +struct dir_rule { + char *inside; // A relative path + char *outside; // This can be an absolute path or a relative path starting with "./" + unsigned int flags; // DIR_FLAG_xxx + struct dir_rule *next; +}; + +enum dir_rule_flags { + DIR_FLAG_RW = 1, + DIR_FLAG_NOEXEC = 2, + DIR_FLAG_FS = 4, + DIR_FLAG_MAYBE = 8, + DIR_FLAG_DEV = 16, +}; + +static const char * const dir_flag_names[] = { "rw", "noexec", "fs", "maybe", "dev" }; + +static struct dir_rule *first_dir_rule; +static struct dir_rule **last_dir_rule = &first_dir_rule; + +static int add_dir_rule(char *in, char *out, unsigned int flags) +{ + // Make sure that "in" is relative + while (in[0] == '/') + in++; + if (!*in) + return 0; + + // Check "out" + if (flags & DIR_FLAG_FS) + { + if (!out || out[0] == '/') + return 0; + } + else + { + if (out && out[0] != '/' && strncmp(out, "./", 2)) + return 0; + } + + // Override an existing rule + struct dir_rule *r; + for (r = first_dir_rule; r; r = r->next) + if (!strcmp(r->inside, in)) + break; + + // Add a new rule + if (!r) + { + r = xmalloc(sizeof(*r)); + r->inside = in; + *last_dir_rule = r; + last_dir_rule = &r->next; + r->next = NULL; + } + r->outside = out; + r->flags = flags; + return 1; +} + +static unsigned int parse_dir_option(char *opt) +{ + for (unsigned int i = 0; i < ARRAY_SIZE(dir_flag_names); i++) + if (!strcmp(opt, dir_flag_names[i])) + return 1U << i; + die("Unknown directory option %s", opt); +} + +static int set_dir_action(char *arg) +{ + arg = xstrdup(arg); + + char *colon = strchr(arg, ':'); + unsigned int flags = 0; + while (colon) + { + *colon++ = 0; + char *next = strchr(colon, ':'); + if (next) + *next = 0; + flags |= parse_dir_option(colon); + colon = next; + } + + char *eq = strchr(arg, '='); + if (eq) + { + *eq++ = 0; + return add_dir_rule(arg, (*eq ? eq : NULL), flags); + } + else + { + char *out = xmalloc(1 + strlen(arg) + 1); + sprintf(out, "/%s", arg); + return add_dir_rule(arg, out, flags); + } +} + +static void init_dir_rules(void) +{ + set_dir_action("box=./box:rw"); + set_dir_action("bin"); + set_dir_action("dev:dev"); + set_dir_action("lib"); + set_dir_action("lib64:maybe"); + set_dir_action("proc=proc:fs"); + set_dir_action("usr"); +} + +static void make_dir(char *path) +{ + char *sep = (path[0] == '/' ? path+1 : path); + + for (;;) + { + sep = strchr(sep, '/'); + if (sep) + *sep = 0; + + if (!dir_exists(path) && mkdir(path, 0777) < 0) + die("Cannot create directory %s: %m\n", path); + + if (!sep) + return; + *sep++ = '/'; + } +} + +static void apply_dir_rules(void) +{ + for (struct dir_rule *r = first_dir_rule; r; r=r->next) + { + char *in = r->inside; + char *out = r->outside; + if (!out) + { + msg("Not binding anything on %s\n", r->inside); + continue; + } + + if ((r->flags & DIR_FLAG_MAYBE) && !dir_exists(out)) + { + msg("Not binding %s on %s (does not exist)\n", out, r->inside); + continue; + } + + char root_in[1024]; + snprintf(root_in, sizeof(root_in), "root/%s", in); + make_dir(root_in); + + unsigned long mount_flags = 0; + if (!(r->flags & DIR_FLAG_RW)) + mount_flags |= MS_RDONLY; + if (r->flags & DIR_FLAG_NOEXEC) + mount_flags |= MS_NOEXEC; + if (!(r->flags & DIR_FLAG_DEV)) + mount_flags |= MS_NODEV; + + if (r->flags & DIR_FLAG_FS) + { + msg("Mounting %s on %s (flags %lx)\n", out, in, mount_flags); + if (mount("none", root_in, out, mount_flags, "") < 0) + die("Cannot mount %s on %s: %m", out, in); + } + else + { + mount_flags |= MS_BIND | MS_NOSUID; + msg("Binding %s on %s (flags %lx)\n", out, in, mount_flags); + // Most mount flags need remount to work + if (mount(out, root_in, "none", mount_flags, "") < 0 || + mount(out, root_in, "none", MS_REMOUNT | mount_flags, "") < 0) + die("Cannot mount %s on %s: %m", out, in); + } + } +} + +/*** Control groups ***/ + +struct cg_controller_desc { + const char *name; + int optional; +}; + +typedef enum { + CG_MEMORY = 0, + CG_CPUACCT, + CG_CPUSET, + CG_NUM_CONTROLLERS, +} cg_controller; + +static const struct cg_controller_desc cg_controllers[CG_NUM_CONTROLLERS+1] = { + [CG_MEMORY] = { "memory", 0 }, + [CG_CPUACCT] = { "cpuacct", 0 }, + [CG_CPUSET] = { "cpuset", 1 }, + [CG_NUM_CONTROLLERS] = { NULL, 0 }, +}; + +#define FOREACH_CG_CONTROLLER(_controller) \ + for (cg_controller (_controller) = 0; \ + (_controller) < CG_NUM_CONTROLLERS; (_controller)++) + +static const char *cg_controller_name(cg_controller c) +{ + return cg_controllers[c].name; +} + +static const int cg_controller_optional(cg_controller c) +{ + return cg_controllers[c].optional; +} + +static char cg_name[256]; + +#define CG_BUFSIZE 1024 + +static void +cg_makepath(char *buf, size_t len, cg_controller c, const char *attr) +{ + const char *cg_root = CONFIG_ISOLATE_CGROUP_ROOT; + snprintf(buf, len, "%s/%s/%s/%s", cg_root, cg_controller_name(c), cg_name, attr); +} + +static int +cg_read(cg_controller controller, const char *attr, char *buf) +{ + int maybe = 0; + if (attr[0] == '?') + { + attr++; + maybe = 1; + } + + char path[256]; + cg_makepath(path, sizeof(path), controller, attr); + + int fd = open(path, O_RDONLY); + if (fd < 0) + { + if (maybe) + return 0; + die("Cannot read %s: %m", path); + } + + int n = read(fd, buf, CG_BUFSIZE); + if (n < 0) + { + if (maybe) + return 0; + die("Cannot read %s: %m", path); + } + if (n >= CG_BUFSIZE - 1) + die("Attribute %s too long", path); + if (n > 0 && buf[n-1] == '\n') + n--; + buf[n] = 0; + + if (verbose > 1) + msg("CG: Read %s = %s\n", attr, buf); + + close(fd); + return 1; +} + +static void __attribute__((format(printf,3,4))) +cg_write(cg_controller controller, const char *attr, const char *fmt, ...) +{ + int maybe = 0; + if (attr[0] == '?') + { + attr++; + maybe = 1; + } + + va_list args; + va_start(args, fmt); + + char buf[CG_BUFSIZE]; + int n = vsnprintf(buf, sizeof(buf), fmt, args); + if (n >= CG_BUFSIZE) + die("cg_write: Value for attribute %s is too long", attr); + + if (verbose > 1) + msg("CG: Write %s = %s", attr, buf); + + char path[256]; + cg_makepath(path, sizeof(path), controller, attr); + + int fd = open(path, O_WRONLY | O_TRUNC); + if (fd < 0) + { + if (maybe) + return; + else + die("Cannot write %s: %m", path); + } + + int written = write(fd, buf, n); + if (written < 0) + { + if (maybe) + return; + else + die("Cannot set %s to %s: %m", path, buf); + } + if (written != n) + die("Short write to %s (%d out of %d bytes)", path, written, n); + + close(fd); + va_end(args); +} + +static void +cg_init(void) +{ + if (!cg_enable) + return; + + char *cg_root = CONFIG_ISOLATE_CGROUP_ROOT; + if (!dir_exists(cg_root)) + die("Control group filesystem at %s not mounted", cg_root); + + snprintf(cg_name, sizeof(cg_name), "box-%d", box_id); + msg("Using control group %s\n", cg_name); +} + +static void +cg_prepare(void) +{ + if (!cg_enable) + return; + + struct stat st; + char buf[CG_BUFSIZE]; + char path[256]; + + FOREACH_CG_CONTROLLER(controller) + { + cg_makepath(path, sizeof(path), controller, ""); + if (stat(path, &st) >= 0 || errno != ENOENT) + { + msg("Control group %s already exists, trying to empty it.\n", path); + if (rmdir(path) < 0) + die("Failed to reset control group %s: %m", path); + } + + if (mkdir(path, 0777) < 0 && !cg_controller_optional(controller)) + die("Failed to create control group %s: %m", path); + } + + // If cpuset module is enabled, copy allowed cpus and memory nodes from parent group + if (cg_read(CG_CPUSET, "?cpuset.cpus", buf)) + cg_write(CG_CPUSET, "cpuset.cpus", "%s", buf); + if (cg_read(CG_CPUSET, "?cpuset.mems", buf)) + cg_write(CG_CPUSET, "cpuset.mems", "%s", buf); +} + +static void +cg_enter(void) +{ + if (!cg_enable) + return; + + msg("Entering control group %s\n", cg_name); + + FOREACH_CG_CONTROLLER(controller) + { + if (cg_controller_optional(controller)) + cg_write(controller, "?tasks", "%d\n", (int) getpid()); + else + cg_write(controller, "tasks", "%d\n", (int) getpid()); + } + + if (cg_memory_limit) + { + cg_write(CG_MEMORY, "memory.limit_in_bytes", "%lld\n", (long long) cg_memory_limit << 10); + cg_write(CG_MEMORY, "?memory.memsw.limit_in_bytes", "%lld\n", (long long) cg_memory_limit << 10); + } + + if (cg_timing) + cg_write(CG_CPUACCT, "cpuacct.usage", "0\n"); +} + +static int +cg_get_run_time_ms(void) +{ + if (!cg_enable) + return 0; + + char buf[CG_BUFSIZE]; + cg_read(CG_CPUACCT, "cpuacct.usage", buf); + unsigned long long ns = atoll(buf); + return ns / 1000000; +} + +static void +cg_stats(void) +{ + if (!cg_enable) + return; + + char buf[CG_BUFSIZE]; + + // Memory usage statistics + unsigned long long mem=0, memsw=0; + if (cg_read(CG_MEMORY, "?memory.max_usage_in_bytes", buf)) + mem = atoll(buf); + if (cg_read(CG_MEMORY, "?memory.memsw.max_usage_in_bytes", buf)) + { + memsw = atoll(buf); + if (memsw > mem) + mem = memsw; + } + if (mem) + meta_printf("cg-mem:%lld\n", mem >> 10); +} + +static void +cg_remove(void) +{ + char buf[CG_BUFSIZE]; + + if (!cg_enable) + return; + + FOREACH_CG_CONTROLLER(controller) + { + if (cg_controller_optional(controller)) { + if (!cg_read(controller, "?tasks", buf)) + continue; + } else + cg_read(controller, "tasks", buf); + + if (buf[0]) + die("Some tasks left in controller %s of cgroup %s, failed to remove it", + cg_controller_name(controller), cg_name); + + char path[256]; + cg_makepath(path, sizeof(path), controller, ""); + + if (rmdir(path) < 0) + die("Cannot remove control group %s: %m", path); + } +} + +/*** Disk quotas ***/ + +static int +path_begins_with(char *path, char *with) +{ + while (*with) + if (*path++ != *with++) + return 0; + return (!*with || *with == '/'); +} + +static char * +find_device(char *path) +{ + FILE *f = setmntent("/proc/mounts", "r"); + if (!f) + die("Cannot open /proc/mounts: %m"); + + struct mntent *me; + int best_len = 0; + char *best_dev = NULL; + while (me = getmntent(f)) + { + if (!path_begins_with(me->mnt_fsname, "/dev")) + continue; + if (path_begins_with(path, me->mnt_dir)) + { + int len = strlen(me->mnt_dir); + if (len > best_len) + { + best_len = len; + free(best_dev); + best_dev = xstrdup(me->mnt_fsname); + } + } + } + endmntent(f); + return best_dev; +} + +static void +set_quota(void) +{ + if (!block_quota) + return; + + char cwd[PATH_MAX]; + if (!getcwd(cwd, sizeof(cwd))) + die("getcwd: %m"); + + char *dev = find_device(cwd); + if (!dev) + die("Cannot identify filesystem which contains %s", cwd); + msg("Quota: Mapped path %s to a filesystem on %s\n", cwd, dev); + + // Sanity check + struct stat dev_st, cwd_st; + if (stat(dev, &dev_st) < 0) + die("Cannot identify block device %s: %m", dev); + if (!S_ISBLK(dev_st.st_mode)) + die("Expected that %s is a block device", dev); + if (stat(".", &cwd_st) < 0) + die("Cannot stat cwd: %m"); + if (cwd_st.st_dev != dev_st.st_rdev) + die("Identified %s as a filesystem on %s, but it is obviously false", cwd, dev); + + struct dqblk dq = { + .dqb_bhardlimit = block_quota, + .dqb_bsoftlimit = block_quota, + .dqb_ihardlimit = inode_quota, + .dqb_isoftlimit = inode_quota, + .dqb_valid = QIF_LIMITS, + }; + if (quotactl(QCMD(Q_SETQUOTA, USRQUOTA), dev, box_uid, (caddr_t) &dq) < 0) + die("Cannot set disk quota: %m"); + msg("Quota: Set block quota %d and inode quota %d\n", block_quota, inode_quota); + + free(dev); +} + /*** The keeper process ***/ static void @@ -395,16 +1001,56 @@ read_proc_file(char *buf, char *name, int *fdp) buf[c] = 0; } +static int +get_wall_time_ms(void) +{ + struct timeval now, wall; + gettimeofday(&now, NULL); + timersub(&now, &start_time, &wall); + return wall.tv_sec*1000 + wall.tv_usec/1000; +} + +static int +get_run_time_ms(struct rusage *rus) +{ + if (cg_timing) + return cg_get_run_time_ms(); + + if (rus) + { + struct timeval total; + timeradd(&rus->ru_utime, &rus->ru_stime, &total); + return total.tv_sec*1000 + total.tv_usec/1000; + } + + char buf[PROC_BUF_SIZE], *x; + int utime, stime; + static int proc_stat_fd; + + read_proc_file(buf, "stat", &proc_stat_fd); + x = buf; + while (*x && *x != ' ') + x++; + while (*x == ' ') + x++; + if (*x++ != '(') + die("proc stat syntax error 1"); + while (*x && (*x != ')' || x[1] != ' ')) + x++; + while (*x == ')' || *x == ' ') + x++; + if (sscanf(x, "%*c %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %d %d", &utime, &stime) != 2) + die("proc stat syntax error 2"); + + return (utime + stime) * 1000 / ticks_per_sec; +} + static void check_timeout(void) { if (wall_timeout) { - struct timeval now, wall; - int wall_ms; - gettimeofday(&now, NULL); - timersub(&now, &start_time, &wall); - wall_ms = wall.tv_sec*1000 + wall.tv_usec/1000; + int wall_ms = get_wall_time_ms(); if (wall_ms > wall_timeout) err("TO: Time limit exceeded (wall clock)"); if (verbose > 1) @@ -412,24 +1058,7 @@ check_timeout(void) } if (timeout) { - char buf[PROC_BUF_SIZE], *x; - int utime, stime, ms; - static int proc_stat_fd; - read_proc_file(buf, "stat", &proc_stat_fd); - x = buf; - while (*x && *x != ' ') - x++; - while (*x == ' ') - x++; - if (*x++ != '(') - die("proc stat syntax error 1"); - while (*x && (*x != ')' || x[1] != ' ')) - x++; - while (*x == ')' || *x == ' ') - x++; - if (sscanf(x, "%*c %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %d %d", &utime, &stime) != 2) - die("proc stat syntax error 2"); - ms = (utime + stime) * 1000 / ticks_per_sec; + int ms = get_run_time_ms(NULL); if (verbose > 1) fprintf(stderr, "[time check: %d msec]\n", ms); if (ms > timeout && ms > extra_timeout) @@ -440,8 +1069,10 @@ check_timeout(void) static void box_keeper(void) { - struct sigaction sa; + read_errors_from_fd = error_pipes[0]; + close(error_pipes[1]); + struct sigaction sa; bzero(&sa, sizeof(sa)); sa.sa_handler = signal_int; sigaction(SIGINT, &sa, NULL); @@ -477,13 +1108,22 @@ box_keeper(void) } if (p != box_pid) die("wait4: unknown pid %d exited!", p); + box_pid = 0; + + // Check error pipe if there is an internal error passed from inside the box + char interr[1024]; + int n = read(read_errors_from_fd, interr, sizeof(interr) - 1); + if (n > 0) + { + interr[n] = 0; + die("%s", interr); + } + if (WIFEXITED(stat)) { - box_pid = 0; final_stats(&rus); if (WEXITSTATUS(stat)) { - // FIXME: Recognize internal errors during setup meta_printf("exitcode:%d\n", WEXITSTATUS(stat)); err("RE: Exited with error status %d", WEXITSTATUS(stat)); } @@ -497,16 +1137,14 @@ box_keeper(void) wall_ms/1000, wall_ms%1000); box_exit(0); } - if (WIFSIGNALED(stat)) + else if (WIFSIGNALED(stat)) { - box_pid = 0; meta_printf("exitsig:%d\n", WTERMSIG(stat)); final_stats(&rus); err("SG: Caught fatal signal %d", WTERMSIG(stat)); } - if (WIFSTOPPED(stat)) + else if (WIFSTOPPED(stat)) { - box_pid = 0; meta_printf("exitsig:%d\n", WSTOPSIG(stat)); final_stats(&rus); err("SG: Stopped by signal %d", WSTOPSIG(stat)); @@ -521,35 +1159,13 @@ box_keeper(void) static void setup_root(void) { - umask(0027); - - if (mkdir("root", 0777) < 0 && errno != EEXIST) + if (mkdir("root", 0750) < 0 && errno != EEXIST) die("mkdir('root'): %m"); if (mount("none", "root", "tmpfs", 0, "mode=755") < 0) die("Cannot mount root ramdisk: %m"); - // FIXME: Make the list of bind-mounts configurable - // FIXME: Virtual /dev? - // FIXME: Read-only mounts? - - static const char * const dirs[] = { "box", "/bin", "/lib", "/usr", "/dev" }; - for (int i=0; i < ARRAY_SIZE(dirs); i++) - { - const char *d = dirs[i]; - char buf[1024]; // FIXME - sprintf(buf, "root/%s", (d[0] == '/' ? d+1 : d)); - msg("Binding %s on %s\n", d, buf); - if (mkdir(buf, 0777) < 0) - die("mkdir(%s): %m", buf); - if (mount(d, buf, "none", MS_BIND | MS_NOSUID | MS_NODEV, "") < 0) - die("Cannot bind %s on %s: %m", d, buf); - } - - if (mkdir("root/proc", 0777) < 0) - die("Cannot create proc: %m"); - if (mount("none", "root/proc", "proc", 0, "") < 0) - die("Cannot mount proc: %m"); + apply_dir_rules(); if (chroot("root") < 0) die("Chroot failed: %m"); @@ -558,29 +1174,21 @@ setup_root(void) die("Cannot change current directory: %m"); } -static int -box_inside(void *arg) +static void +setup_credentials(void) { - char **argv = arg; - int argc = 0; - while (argv[argc]) - argc++; - - struct rlimit rl; - char *args[argc+1]; - - memcpy(args, argv, argc * sizeof(char *)); - args[argc] = NULL; - - setup_root(); - - if (setresgid(BOX_GID, BOX_GID, BOX_GID) < 0) + if (setresgid(box_gid, box_gid, box_gid) < 0) die("setresgid: %m"); if (setgroups(0, NULL) < 0) die("setgroups: %m"); - if (setresuid(BOX_UID, BOX_UID, BOX_UID) < 0) + if (setresuid(box_uid, box_uid, box_uid) < 0) die("setresuid: %m"); + setpgrp(); +} +static void +setup_fds(void) +{ if (redir_stdin) { close(0); @@ -601,66 +1209,113 @@ box_inside(void *arg) } else dup2(1, 2); - setpgrp(); +} + +static void +setup_rlim(const char *res_name, int res, rlim_t limit) +{ + struct rlimit rl = { .rlim_cur = limit, .rlim_max = limit }; + if (setrlimit(res, &rl) < 0) + die("setrlimit(%s, %jd)", res_name, (intmax_t) limit); +} + +static void +setup_rlimits(void) +{ +#define RLIM(res, val) setup_rlim("RLIMIT_" #res, RLIMIT_##res, val) if (memory_limit) - { - rl.rlim_cur = rl.rlim_max = memory_limit * 1024; - if (setrlimit(RLIMIT_AS, &rl) < 0) - die("setrlimit(RLIMIT_AS): %m"); - } + RLIM(AS, memory_limit * 1024); - rl.rlim_cur = rl.rlim_max = (stack_limit ? (rlim_t)stack_limit * 1024 : RLIM_INFINITY); - if (setrlimit(RLIMIT_STACK, &rl) < 0) - die("setrlimit(RLIMIT_STACK): %m"); + RLIM(STACK, (stack_limit ? (rlim_t)stack_limit * 1024 : RLIM_INFINITY)); + RLIM(NOFILE, 64); + RLIM(MEMLOCK, 0); - rl.rlim_cur = rl.rlim_max = 64; - if (setrlimit(RLIMIT_NOFILE, &rl) < 0) - die("setrlimit(RLIMIT_NOFILE): %m"); + if (max_processes) + RLIM(NPROC, max_processes); - // FIXME: Create multi-process mode - rl.rlim_cur = rl.rlim_max = 1; - if (setrlimit(RLIMIT_NPROC, &rl) < 0) - die("setrlimit(RLIMIT_NPROC): %m"); +#undef RLIM +} - rl.rlim_cur = rl.rlim_max = 0; - if (setrlimit(RLIMIT_MEMLOCK, &rl) < 0) - die("setrlimit(RLIMIT_MEMLOCK): %m"); +static int +box_inside(void *arg) +{ + char **args = arg; + write_errors_to_fd = error_pipes[1]; + close(error_pipes[0]); + cg_enter(); + setup_root(); + setup_credentials(); + setup_fds(); + setup_rlimits(); char **env = setup_environment(); + + if (set_cwd && chdir(set_cwd)) + die("chdir: %m"); + execve(args[0], args, env); die("execve(\"%s\"): %m", args[0]); } static void -prepare(void) +box_init(void) +{ + if (box_id < 0 || box_id >= CONFIG_ISOLATE_NUM_BOXES) + die("Sandbox ID out of range (allowed: 0-%d)", CONFIG_ISOLATE_NUM_BOXES-1); + box_uid = CONFIG_ISOLATE_FIRST_UID + box_id; + box_gid = CONFIG_ISOLATE_FIRST_GID + box_id; + + snprintf(box_dir, sizeof(box_dir), "%s/%d", CONFIG_ISOLATE_BOX_DIR, box_id); + make_dir(box_dir); + if (chdir(box_dir) < 0) + die("chdir(%s): %m", box_dir); +} + +/*** Commands ***/ + +static void +init(void) { msg("Preparing sandbox directory\n"); - xsystem("rm -rf box"); + rmtree("box"); if (mkdir("box", 0700) < 0) die("Cannot create box: %m"); if (chown("box", orig_uid, orig_gid) < 0) die("Cannot chown box: %m"); + + cg_prepare(); + set_quota(); + + puts(box_dir); } static void cleanup(void) { + if (!dir_exists("box")) + die("Box directory not found, there isn't anything to clean up"); + msg("Deleting sandbox directory\n"); - xsystem("rm -rf box"); + rmtree(box_dir); + cg_remove(); } static void run(char **argv) { - struct stat st; - if (stat("box", &st) < 0 || !S_ISDIR(st.st_mode)) - die("Box directory not found, did you run `isolate --prepare'?"); + if (!dir_exists("box")) + die("Box directory not found, did you run `isolate --init'?"); - char cmd[256]; - snprintf(cmd, sizeof(cmd), "chown -R %d.%d box", BOX_UID, BOX_GID); - xsystem(cmd); - snprintf(cleanup_cmd, sizeof(cleanup_cmd), "chown -R %d.%d box", orig_uid, orig_gid); + chowntree("box", box_uid, box_gid); + cleanup_ownership = 1; + + if (pipe(error_pipes) < 0) + die("pipe: %m"); + for (int i=0; i<2; i++) + if (fcntl(error_pipes[i], F_SETFD, fcntl(error_pipes[i], F_GETFD) | FD_CLOEXEC) < 0 || + fcntl(error_pipes[i], F_SETFL, fcntl(error_pipes[i], F_GETFL) | O_NONBLOCK) < 0) + die("fcntl on pipe: %m"); box_pid = clone( box_inside, // Function to execute as the body of the new process @@ -677,38 +1332,65 @@ run(char **argv) static void show_version(void) { - // FIXME - printf("Process isolator 0.0\n"); - printf("(c) 2012 Martin Mares \n\n"); - printf("Sandbox directory: %s\n", BOX_DIR); - printf("Sandbox credentials: uid=%u gid=%u\n", BOX_UID, BOX_GID); + printf("Process isolator 1.0\n"); + printf("(c) 2012 Martin Mares and Bernard Blackham\n"); + printf("\nCompile-time configuration:\n"); + printf("Sandbox directory: %s\n", CONFIG_ISOLATE_BOX_DIR); + printf("Sandbox credentials: uid=%u-%u gid=%u-%u\n", + CONFIG_ISOLATE_FIRST_UID, + CONFIG_ISOLATE_FIRST_UID + CONFIG_ISOLATE_NUM_BOXES - 1, + CONFIG_ISOLATE_FIRST_GID, + CONFIG_ISOLATE_FIRST_GID + CONFIG_ISOLATE_NUM_BOXES - 1); } -static void -usage(void) +/*** Options ***/ + +static void __attribute__((format(printf,1,2))) +usage(const char *msg, ...) { - fprintf(stderr, "Invalid arguments!\n"); + if (msg != NULL) + { + va_list args; + va_start(args, msg); + vfprintf(stderr, msg, args); + va_end(args); + } printf("\ Usage: isolate [] \n\ \n\ Options:\n\ --e, --full-env\t\tInherit full environment of the parent process\n\ --E, --env=\tInherit the environment variable from the parent process\n\ +-b, --box-id=\tWhen multiple sandboxes are used in parallel, each must get a unique ID\n\ +-c, --cg[=]\tPut process in a control group (optionally a sub-group of )\n\ + --cg-mem=\tLimit memory usage of the control group to KB\n\ + --cg-timing\t\tTime limits affects total run time of the control group\n\ +-d, --dir=\t\tMake a directory visible inside the sandbox\n\ + --dir==\tMake a directory outside visible as inside\n\ + --dir==\t\tDelete a previously defined directory rule (even a default one)\n\ + --dir=...:\tSpecify options for a rule:\n\ +\t\t\t\tdev\tAllow access to special files\n\ +\t\t\t\tfs\tMount a filesystem (e.g., --dir=/proc:proc:fs)\n\ +\t\t\t\tmaybe\tSkip the rule if does not exist\n\ +\t\t\t\tnoexec\tDo not allow execution of binaries\n\ +\t\t\t\trw\tAllow read-write access\n\ +-E, --env=\t\tInherit the environment variable from the parent process\n\ -E, --env==\tSet the environment variable to ; unset it if is empty\n\ --i, --stdin=\tRedirect stdin from \n\ --k, --stack=\tLimit stack size to KB (default: 0=unlimited)\n\ +-x, --extra-time=