X-Git-Url: http://mj.ucw.cz/gitweb/?a=blobdiff_plain;f=isolate%2Fisolate.c;h=b0cbb9aefa23276e312b25c399b0b62b26f4f716;hb=8e14b1a828f7aa8eae6f82f5fc965812628eca04;hp=c36c36bb17a61558fe3ea804bce0551a9b6e085b;hpb=6ebe2d19d215c5599ffbd045be4f2cf3079eea96;p=eval.git diff --git a/isolate/isolate.c b/isolate/isolate.c index c36c36b..b0cbb9a 100644 --- a/isolate/isolate.c +++ b/isolate/isolate.c @@ -1,8 +1,8 @@ /* * A Process Isolator based on Linux Containers * - * (c) 2012 Martin Mares - * (c) 2012 Bernard Blackham + * (c) 2012-2014 Martin Mares + * (c) 2012-2014 Bernard Blackham */ #define _GNU_SOURCE @@ -20,22 +20,23 @@ #include #include #include +#include #include +#include +#include #include #include #include #include #include #include +#include +#include #define NONRET __attribute__((noreturn)) #define UNUSED __attribute__((unused)) #define ARRAY_SIZE(a) (int)(sizeof(a)/sizeof(a[0])) -#define BOX_DIR CONFIG_ISOLATE_BOX_DIR -#define BOX_UID CONFIG_ISOLATE_BOX_UID -#define BOX_GID CONFIG_ISOLATE_BOX_GID - static int timeout; /* milliseconds */ static int wall_timeout; static int extra_timeout; @@ -43,20 +44,27 @@ static int pass_environ; static int verbose; static int memory_limit; static int stack_limit; +static int block_quota; +static int inode_quota; static int max_processes = 1; static char *redir_stdin, *redir_stdout, *redir_stderr; +static char *set_cwd; static int cg_enable; static int cg_memory_limit; static int cg_timing; -static char *cg_root = "/sys/fs/cgroup"; +static int box_id; +static char box_dir[1024]; +static pid_t box_pid; + +static uid_t box_uid; +static gid_t box_gid; static uid_t orig_uid; static gid_t orig_gid; -static pid_t box_pid; static int partial_line; -static char cleanup_cmd[256]; +static int cleanup_ownership; static struct timeval start_time; static int ticks_per_sec; @@ -72,6 +80,8 @@ static void cg_stats(void); static int get_wall_time_ms(void); static int get_run_time_ms(struct rusage *rus); +static void chowntree(char *path, uid_t uid, gid_t gid); + /*** Meta-files ***/ static FILE *metafile; @@ -125,16 +135,6 @@ final_stats(struct rusage *rus) /*** Messages and exits ***/ -static void -xsystem(const char *cmd) -{ - int ret = system(cmd); - if (ret < 0) - die("system(\"%s\"): %m", cmd); - if (!WIFEXITED(ret) || WEXITSTATUS(ret)) - die("system(\"%s\"): Exited with status %d", cmd, ret); -} - static void NONRET box_exit(int rc) { @@ -155,8 +155,10 @@ box_exit(int rc) final_stats(&rus); } - if (rc < 2 && cleanup_cmd[0]) - xsystem(cleanup_cmd); + if (rc < 2 && cleanup_ownership) + { + chowntree("box", orig_uid, orig_gid); + } meta_close(); exit(rc); @@ -258,6 +260,47 @@ static int dir_exists(char *path) return (stat(path, &st) >= 0 && S_ISDIR(st.st_mode)); } +static int rmtree_helper(const char *fpath, const struct stat *sb, + int typeflag, struct FTW *ftwbuf) +{ + if (S_ISDIR(sb->st_mode)) + { + if (rmdir(fpath) < 0) + die("Cannot rmdir %s: %m", fpath); + } + else + { + if (unlink(fpath) < 0) + die("Cannot unlink %s: %m", fpath); + } + return FTW_CONTINUE; +} + +static void +rmtree(char *path) +{ + nftw(path, rmtree_helper, 32, FTW_MOUNT | FTW_PHYS | FTW_DEPTH); +} + +static uid_t chown_uid; +static gid_t chown_gid; +static int chowntree_helper(const char *fpath, const struct stat *sb, + int typeflag, struct FTW *ftwbuf) +{ + if (lchown(fpath, chown_uid, chown_gid) < 0) + die("Cannot chown %s: %m", fpath); + else + return FTW_CONTINUE; +} + +static void +chowntree(char *path, uid_t uid, gid_t gid) +{ + chown_uid = uid; + chown_gid = gid; + nftw(path, chowntree_helper, 32, FTW_MOUNT | FTW_PHYS); +} + /*** Environment rules ***/ struct env_rule { @@ -393,7 +436,7 @@ setup_environment(void) return env; } -/*** Mount rules ***/ +/*** Directory rules ***/ struct dir_rule { char *inside; // A relative path @@ -506,7 +549,8 @@ static void init_dir_rules(void) static void make_dir(char *path) { - char *sep = path; + char *sep = (path[0] == '/' ? path+1 : path); + for (;;) { sep = strchr(sep, '/'); @@ -572,12 +616,52 @@ static void apply_dir_rules(void) /*** Control groups ***/ -static char cg_path[256]; +struct cg_controller_desc { + const char *name; + int optional; +}; + +typedef enum { + CG_MEMORY = 0, + CG_CPUACCT, + CG_CPUSET, + CG_NUM_CONTROLLERS, +} cg_controller; + +static const struct cg_controller_desc cg_controllers[CG_NUM_CONTROLLERS+1] = { + [CG_MEMORY] = { "memory", 0 }, + [CG_CPUACCT] = { "cpuacct", 0 }, + [CG_CPUSET] = { "cpuset", 1 }, + [CG_NUM_CONTROLLERS] = { NULL, 0 }, +}; + +#define FOREACH_CG_CONTROLLER(_controller) \ + for (cg_controller (_controller) = 0; \ + (_controller) < CG_NUM_CONTROLLERS; (_controller)++) + +static const char *cg_controller_name(cg_controller c) +{ + return cg_controllers[c].name; +} + +static const int cg_controller_optional(cg_controller c) +{ + return cg_controllers[c].optional; +} + +static char cg_name[256]; #define CG_BUFSIZE 1024 +static void +cg_makepath(char *buf, size_t len, cg_controller c, const char *attr) +{ + const char *cg_root = CONFIG_ISOLATE_CGROUP_ROOT; + snprintf(buf, len, "%s/%s/%s/%s", cg_root, cg_controller_name(c), cg_name, attr); +} + static int -cg_read(char *attr, char *buf) +cg_read(cg_controller controller, const char *attr, char *buf) { int maybe = 0; if (attr[0] == '?') @@ -587,7 +671,7 @@ cg_read(char *attr, char *buf) } char path[256]; - snprintf(path, sizeof(path), "%s/%s", cg_path, attr); + cg_makepath(path, sizeof(path), controller, attr); int fd = open(path, O_RDONLY); if (fd < 0) @@ -599,7 +683,11 @@ cg_read(char *attr, char *buf) int n = read(fd, buf, CG_BUFSIZE); if (n < 0) - die("Cannot read %s: %m", path); + { + if (maybe) + return 0; + die("Cannot read %s: %m", path); + } if (n >= CG_BUFSIZE - 1) die("Attribute %s too long", path); if (n > 0 && buf[n-1] == '\n') @@ -613,30 +701,47 @@ cg_read(char *attr, char *buf) return 1; } -static void __attribute__((format(printf,2,3))) -cg_write(char *attr, char *fmt, ...) +static void __attribute__((format(printf,3,4))) +cg_write(cg_controller controller, const char *attr, const char *fmt, ...) { + int maybe = 0; + if (attr[0] == '?') + { + attr++; + maybe = 1; + } + va_list args; va_start(args, fmt); char buf[CG_BUFSIZE]; int n = vsnprintf(buf, sizeof(buf), fmt, args); if (n >= CG_BUFSIZE) - die("cg_writef: Value for attribute %s is too long", attr); + die("cg_write: Value for attribute %s is too long", attr); if (verbose > 1) msg("CG: Write %s = %s", attr, buf); char path[256]; - snprintf(path, sizeof(path), "%s/%s", cg_path, attr); + cg_makepath(path, sizeof(path), controller, attr); int fd = open(path, O_WRONLY | O_TRUNC); if (fd < 0) - die("Cannot write %s: %m", path); + { + if (maybe) + return; + else + die("Cannot write %s: %m", path); + } int written = write(fd, buf, n); if (written < 0) - die("Cannot set %s to %s: %m", path, buf); + { + if (maybe) + return; + else + die("Cannot set %s to %s: %m", path, buf); + } if (written != n) die("Short write to %s (%d out of %d bytes)", path, written, n); @@ -650,11 +755,12 @@ cg_init(void) if (!cg_enable) return; + char *cg_root = CONFIG_ISOLATE_CGROUP_ROOT; if (!dir_exists(cg_root)) die("Control group filesystem at %s not mounted", cg_root); - snprintf(cg_path, sizeof(cg_path), "%s/box-%d", cg_root, BOX_UID); - msg("Using control group %s\n", cg_path); + snprintf(cg_name, sizeof(cg_name), "box-%d", box_id); + msg("Using control group %s\n", cg_name); } static void @@ -665,22 +771,27 @@ cg_prepare(void) struct stat st; char buf[CG_BUFSIZE]; + char path[256]; - if (stat(cg_path, &st) >= 0 || errno != ENOENT) + FOREACH_CG_CONTROLLER(controller) { - msg("Control group %s already exists, trying to empty it.\n", cg_path); - if (rmdir(cg_path) < 0) - die("Failed to reset control group %s: %m", cg_path); - } + cg_makepath(path, sizeof(path), controller, ""); + if (stat(path, &st) >= 0 || errno != ENOENT) + { + msg("Control group %s already exists, trying to empty it.\n", path); + if (rmdir(path) < 0) + die("Failed to reset control group %s: %m", path); + } - if (mkdir(cg_path, 0777) < 0) - die("Failed to create control group %s: %m", cg_path); + if (mkdir(path, 0777) < 0 && !cg_controller_optional(controller)) + die("Failed to create control group %s: %m", path); + } // If cpuset module is enabled, copy allowed cpus and memory nodes from parent group - if (cg_read("?../cpuset.cpus", buf)) - cg_write("cpuset.cpus", "%s", buf); - if (cg_read("?../cpuset.mems", buf)) - cg_write("cpuset.mems", "%s", buf); + if (cg_read(CG_CPUSET, "?cpuset.cpus", buf)) + cg_write(CG_CPUSET, "cpuset.cpus", "%s", buf); + if (cg_read(CG_CPUSET, "?cpuset.mems", buf)) + cg_write(CG_CPUSET, "cpuset.mems", "%s", buf); } static void @@ -689,22 +800,24 @@ cg_enter(void) if (!cg_enable) return; - msg("Entering control group %s\n", cg_path); + msg("Entering control group %s\n", cg_name); - struct stat st; - if (stat(cg_path, &st) < 0) - die("Control group %s does not exist: %m", cg_path); + FOREACH_CG_CONTROLLER(controller) + { + if (cg_controller_optional(controller)) + cg_write(controller, "?tasks", "%d\n", (int) getpid()); + else + cg_write(controller, "tasks", "%d\n", (int) getpid()); + } if (cg_memory_limit) { - cg_write("memory.limit_in_bytes", "%lld\n", (long long) cg_memory_limit << 10); - cg_write("memory.memsw.limit_in_bytes", "%lld\n", (long long) cg_memory_limit << 10); + cg_write(CG_MEMORY, "memory.limit_in_bytes", "%lld\n", (long long) cg_memory_limit << 10); + cg_write(CG_MEMORY, "?memory.memsw.limit_in_bytes", "%lld\n", (long long) cg_memory_limit << 10); } if (cg_timing) - cg_write("cpuacct.usage", "0\n"); - - cg_write("tasks", "%d\n", (int) getpid()); + cg_write(CG_CPUACCT, "cpuacct.usage", "0\n"); } static int @@ -714,7 +827,7 @@ cg_get_run_time_ms(void) return 0; char buf[CG_BUFSIZE]; - cg_read("cpuacct.usage", buf); + cg_read(CG_CPUACCT, "cpuacct.usage", buf); unsigned long long ns = atoll(buf); return ns / 1000000; } @@ -729,9 +842,9 @@ cg_stats(void) // Memory usage statistics unsigned long long mem=0, memsw=0; - if (cg_read("?memory.max_usage_in_bytes", buf)) + if (cg_read(CG_MEMORY, "?memory.max_usage_in_bytes", buf)) mem = atoll(buf); - if (cg_read("?memory.memsw.max_usage_in_bytes", buf)) + if (cg_read(CG_MEMORY, "?memory.memsw.max_usage_in_bytes", buf)) { memsw = atoll(buf); if (memsw > mem) @@ -749,12 +862,104 @@ cg_remove(void) if (!cg_enable) return; - cg_read("tasks", buf); - if (buf[0]) - die("Some tasks left in control group %s, failed to remove it", cg_path); + FOREACH_CG_CONTROLLER(controller) + { + if (cg_controller_optional(controller)) { + if (!cg_read(controller, "?tasks", buf)) + continue; + } else + cg_read(controller, "tasks", buf); + + if (buf[0]) + die("Some tasks left in controller %s of cgroup %s, failed to remove it", + cg_controller_name(controller), cg_name); + + char path[256]; + cg_makepath(path, sizeof(path), controller, ""); - if (rmdir(cg_path) < 0) - die("Cannot remove control group %s: %m", cg_path); + if (rmdir(path) < 0) + die("Cannot remove control group %s: %m", path); + } +} + +/*** Disk quotas ***/ + +static int +path_begins_with(char *path, char *with) +{ + while (*with) + if (*path++ != *with++) + return 0; + return (!*with || *with == '/'); +} + +static char * +find_device(char *path) +{ + FILE *f = setmntent("/proc/mounts", "r"); + if (!f) + die("Cannot open /proc/mounts: %m"); + + struct mntent *me; + int best_len = 0; + char *best_dev = NULL; + while (me = getmntent(f)) + { + if (!path_begins_with(me->mnt_fsname, "/dev")) + continue; + if (path_begins_with(path, me->mnt_dir)) + { + int len = strlen(me->mnt_dir); + if (len > best_len) + { + best_len = len; + free(best_dev); + best_dev = xstrdup(me->mnt_fsname); + } + } + } + endmntent(f); + return best_dev; +} + +static void +set_quota(void) +{ + if (!block_quota) + return; + + char cwd[PATH_MAX]; + if (!getcwd(cwd, sizeof(cwd))) + die("getcwd: %m"); + + char *dev = find_device(cwd); + if (!dev) + die("Cannot identify filesystem which contains %s", cwd); + msg("Quota: Mapped path %s to a filesystem on %s\n", cwd, dev); + + // Sanity check + struct stat dev_st, cwd_st; + if (stat(dev, &dev_st) < 0) + die("Cannot identify block device %s: %m", dev); + if (!S_ISBLK(dev_st.st_mode)) + die("Expected that %s is a block device", dev); + if (stat(".", &cwd_st) < 0) + die("Cannot stat cwd: %m"); + if (cwd_st.st_dev != dev_st.st_rdev) + die("Identified %s as a filesystem on %s, but it is obviously false", cwd, dev); + + struct dqblk dq = { + .dqb_bhardlimit = block_quota, + .dqb_bsoftlimit = block_quota, + .dqb_ihardlimit = inode_quota, + .dqb_isoftlimit = inode_quota, + .dqb_valid = QIF_LIMITS, + }; + if (quotactl(QCMD(Q_SETQUOTA, USRQUOTA), dev, box_uid, (caddr_t) &dq) < 0) + die("Cannot set disk quota: %m"); + msg("Quota: Set block quota %d and inode quota %d\n", block_quota, inode_quota); + + free(dev); } /*** The keeper process ***/ @@ -972,11 +1177,11 @@ setup_root(void) static void setup_credentials(void) { - if (setresgid(BOX_GID, BOX_GID, BOX_GID) < 0) + if (setresgid(box_gid, box_gid, box_gid) < 0) die("setresgid: %m"); if (setgroups(0, NULL) < 0) die("setgroups: %m"); - if (setresuid(BOX_UID, BOX_UID, BOX_UID) < 0) + if (setresuid(box_uid, box_uid, box_uid) < 0) die("setresuid: %m"); setpgrp(); } @@ -1046,23 +1251,43 @@ box_inside(void *arg) setup_rlimits(); char **env = setup_environment(); + if (set_cwd && chdir(set_cwd)) + die("chdir: %m"); + execve(args[0], args, env); die("execve(\"%s\"): %m", args[0]); } +static void +box_init(void) +{ + if (box_id < 0 || box_id >= CONFIG_ISOLATE_NUM_BOXES) + die("Sandbox ID out of range (allowed: 0-%d)", CONFIG_ISOLATE_NUM_BOXES-1); + box_uid = CONFIG_ISOLATE_FIRST_UID + box_id; + box_gid = CONFIG_ISOLATE_FIRST_GID + box_id; + + snprintf(box_dir, sizeof(box_dir), "%s/%d", CONFIG_ISOLATE_BOX_DIR, box_id); + make_dir(box_dir); + if (chdir(box_dir) < 0) + die("chdir(%s): %m", box_dir); +} + /*** Commands ***/ static void init(void) { msg("Preparing sandbox directory\n"); - xsystem("rm -rf box"); + rmtree("box"); if (mkdir("box", 0700) < 0) die("Cannot create box: %m"); if (chown("box", orig_uid, orig_gid) < 0) die("Cannot chown box: %m"); cg_prepare(); + set_quota(); + + puts(box_dir); } static void @@ -1072,7 +1297,7 @@ cleanup(void) die("Box directory not found, there isn't anything to clean up"); msg("Deleting sandbox directory\n"); - xsystem("rm -rf box"); + rmtree(box_dir); cg_remove(); } @@ -1082,10 +1307,8 @@ run(char **argv) if (!dir_exists("box")) die("Box directory not found, did you run `isolate --init'?"); - char cmd[256]; - snprintf(cmd, sizeof(cmd), "chown -R %d.%d box", BOX_UID, BOX_GID); - xsystem(cmd); - snprintf(cleanup_cmd, sizeof(cleanup_cmd), "chown -R %d.%d box", orig_uid, orig_gid); + chowntree("box", box_uid, box_gid); + cleanup_ownership = 1; if (pipe(error_pipes) < 0) die("pipe: %m"); @@ -1112,20 +1335,31 @@ show_version(void) printf("Process isolator 1.0\n"); printf("(c) 2012 Martin Mares and Bernard Blackham\n"); printf("\nCompile-time configuration:\n"); - printf("Sandbox directory: %s\n", BOX_DIR); - printf("Sandbox credentials: uid=%u gid=%u\n", BOX_UID, BOX_GID); + printf("Sandbox directory: %s\n", CONFIG_ISOLATE_BOX_DIR); + printf("Sandbox credentials: uid=%u-%u gid=%u-%u\n", + CONFIG_ISOLATE_FIRST_UID, + CONFIG_ISOLATE_FIRST_UID + CONFIG_ISOLATE_NUM_BOXES - 1, + CONFIG_ISOLATE_FIRST_GID, + CONFIG_ISOLATE_FIRST_GID + CONFIG_ISOLATE_NUM_BOXES - 1); } /*** Options ***/ -static void -usage(void) +static void __attribute__((format(printf,1,2))) +usage(const char *msg, ...) { - fprintf(stderr, "Invalid arguments!\n"); + if (msg != NULL) + { + va_list args; + va_start(args, msg); + vfprintf(stderr, msg, args); + va_end(args); + } printf("\ Usage: isolate [] \n\ \n\ Options:\n\ +-b, --box-id=\tWhen multiple sandboxes are used in parallel, each must get a unique ID\n\ -c, --cg[=]\tPut process in a control group (optionally a sub-group of )\n\ --cg-mem=\tLimit memory usage of the control group to KB\n\ --cg-timing\t\tTime limits affects total run time of the control group\n\ @@ -1145,6 +1379,7 @@ Options:\n\ -e, --full-env\t\tInherit full environment of the parent process\n\ -m, --mem=\tLimit address space to KB\n\ -M, --meta=\tOutput process information to (name:value)\n\ +-q, --quota=,\tSet disk quota to blocks and inodes\n\ -k, --stack=\tLimit stack size to KB (default: 0=unlimited)\n\ -r, --stderr=\tRedirect stderr to \n\ -i, --stdin=\tRedirect stdin from \n\ @@ -1168,14 +1403,17 @@ enum opt_code { OPT_RUN, OPT_CLEANUP, OPT_VERSION, + OPT_CG, OPT_CG_MEM, OPT_CG_TIMING, }; -static const char short_opts[] = "c::d:eE:i:k:m:M:o:p::r:t:vw:x:"; +static const char short_opts[] = "b:c:d:eE:i:k:m:M:o:p::q:r:t:vw:x:"; static const struct option long_opts[] = { - { "cg", 2, NULL, 'c' }, + { "box-id", 1, NULL, 'b' }, + { "chdir", 1, NULL, 'c' }, + { "cg", 0, NULL, OPT_CG }, { "cg-mem", 1, NULL, OPT_CG_MEM }, { "cg-timing", 0, NULL, OPT_CG_TIMING }, { "cleanup", 0, NULL, OPT_CLEANUP }, @@ -1187,6 +1425,7 @@ static const struct option long_opts[] = { { "mem", 1, NULL, 'm' }, { "meta", 1, NULL, 'M' }, { "processes", 2, NULL, 'p' }, + { "quota", 1, NULL, 'q' }, { "run", 0, NULL, OPT_RUN }, { "stack", 1, NULL, 'k' }, { "stderr", 1, NULL, 'r' }, @@ -1203,6 +1442,7 @@ int main(int argc, char **argv) { int c; + char *sep; enum opt_code mode = 0; init_dir_rules(); @@ -1210,21 +1450,25 @@ main(int argc, char **argv) while ((c = getopt_long(argc, argv, short_opts, long_opts, NULL)) >= 0) switch (c) { + case 'b': + box_id = atoi(optarg); + break; case 'c': - if (optarg) - cg_root = optarg; + set_cwd = optarg; + break; + case OPT_CG: cg_enable = 1; break; case 'd': if (!set_dir_action(optarg)) - usage(); + usage("Invalid directory specified: %s\n", optarg); break; case 'e': pass_environ = 1; break; case 'E': if (!set_env_action(optarg)) - usage(); + usage("Invalid environment specified: %s\n", optarg); break; case 'k': stack_limit = atoi(optarg); @@ -1247,6 +1491,13 @@ main(int argc, char **argv) else max_processes = 0; break; + case 'q': + sep = strchr(optarg, ','); + if (!sep) + usage("Invalid quota specified: %s\n", optarg); + block_quota = atoi(optarg); + inode_quota = atoi(sep+1); + break; case 'r': redir_stderr = optarg; break; @@ -1266,7 +1517,10 @@ main(int argc, char **argv) case OPT_RUN: case OPT_CLEANUP: case OPT_VERSION: - mode = c; + if (!mode || mode == c) + mode = c; + else + usage("Only one command is allowed.\n"); break; case OPT_CG_MEM: cg_memory_limit = atoi(optarg); @@ -1275,11 +1529,11 @@ main(int argc, char **argv) cg_timing = 1; break; default: - usage(); + usage(NULL); } if (!mode) - usage(); + usage("Please specify an isolate command (e.g. --init, --run).\n"); if (mode == OPT_VERSION) { show_version(); @@ -1292,25 +1546,24 @@ main(int argc, char **argv) orig_gid = getgid(); umask(022); - if (chdir(BOX_DIR) < 0) - die("chdir(%s): %m", BOX_DIR); + box_init(); cg_init(); switch (mode) { case OPT_INIT: if (optind < argc) - usage(); + usage("--init mode takes no parameters\n"); init(); break; case OPT_RUN: if (optind >= argc) - usage(); + usage("--run mode requires a command to run\n"); run(argv+optind); break; case OPT_CLEANUP: if (optind < argc) - usage(); + usage("--cleanup mode takes no parameters\n"); cleanup(); break; default: