diff --git a/.gitignore b/.gitignore
index 9c849e3..8a7a88c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@
 *.jl.*.cov
 *.jl.mem
 global_prefix
+.vscode/
diff --git a/.travis.yml b/.travis.yml
index 22d420f..7f35275 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -23,7 +23,8 @@ env:
   matrix:
     - BINARYBUILDER_USE_SQUASHFS=true
     - BINARYBUILDER_USE_SQUASHFS=false
-    
+    - BINARYBUILDER_RUNNER=privileged
+
 cache:
   directories:
     - deps/downloads
diff --git a/deps/sandbox.c b/deps/sandbox.c
index 17740f0..7b2f8e2 100644
--- a/deps/sandbox.c
+++ b/deps/sandbox.c
@@ -1,6 +1,39 @@
-/* Copyright (c) 2016 Julia Computing Inc */
+/* Copyright (c) 2017 Julia Computing Inc */
 #define _GNU_SOURCE
 
+/*
+  sandbox.c - Combination sandbox execution platform and init replacement
+
+This file serves as the entrypoint into our sandboxed/virtualized execution environment for
+BinaryBuilder.jl; it has three execution modes:
+
+  1) Unprivileged container mode.
+  2) Privileged container mode.
+  3) Init mode.
+
+Each mode does similar things, but in a different order and with different privileges. Eventually,
+all modes seek the same result; to run a user program with the base root fs and any other shards
+requested by the user within the BinaryBuilder.jl execution environment. We will walk through the
+three modes here, to explain what each does.
+
+* Unprivileged container mode is the "normal" mode of execution; it attempts to use the native
+kernel namespace abilities to setup its environment without ever needing to be `root`. It does this
+by creating a user namespace, then using its root privileges within the namespace to mount the
+necessary shards, `chroot`, etc... within the right places in the new mount namespace created within
+the container.
+
+* Privileged container mode is what happens when `sandbox` is invoked with EUID == 0.  In this
+mode, the mounts and chroots and whatnot are performed _before_ creating a new user namespace.
+This is used as a workaround for kernels that do not have the capabilities for creating mounts
+within user namespaces.  Arch Linux is a great example of this.
+
+* Init mode is used when `sandbox` is invoked with PID == 1.  In this mode, some extra work needs to
+happen first as this sandbox is the first user program running on a virtualized system, e.g. inside
+of QEMU, and it needs to setup the plan 9 filesystem mounts and whatnot.  There is no
+containerization or namespaces that happen in this mode.
+*/
+
+
 /* Seperate because the headers below don't have all dependencies properly
    declared */
 #include <sys/socket.h>
@@ -38,12 +71,38 @@
 #include <linux/limits.h>
 #include <getopt.h>
 
+/**** Global Variables ***/
+
+// TODO: NABIL: Explain what these are better 
+char *sandbox_root = NULL;
+char *workspace = NULL;
+char *new_cd = NULL;
+unsigned char verbose = 0;
+
+// Linked list of volume mappings
+struct map_list {
+    char *map_path;
+    char *outside_path;
+    struct map_list *prev;
+};
+struct map_list *maps;
+
+// This keeps track of our execution mode
+enum {
+  UNPRIVILEGED_CONTAINER_MODE,
+  PRIVILEGED_CONTAINER_MODE,
+  INIT_MODE,
+};
+static int execution_mode;
+
+
+
 /**** General Utilities ***/
 
 /* Like assert, but don't go away with optimizations */
 static void _check(int ok, int line) {
   if (!ok) {
-    printf("At line %d, ABORTED (%s)!\n", line, strerror(errno));
+    fprintf(stderr, "At line %d, ABORTED (%s)!\n", line, strerror(errno));
     abort();
   }
 }
@@ -51,7 +110,7 @@ static void _check(int ok, int line) {
 
 /* Opens /proc/%pid/%file */
 static int open_proc_file(pid_t pid, const char *file, int mode) {
-  char path[100];
+  char path[PATH_MAX];
   int n = snprintf(path, sizeof(path), "/proc/%d/%s", pid, file);
   check(n >= 0 && n < sizeof(path));
   int fd = open(path, mode);
@@ -61,26 +120,32 @@ static int open_proc_file(pid_t pid, const char *file, int mode) {
 
 /**** 2: User namespaces
  *
- * For a general overview on user namespaces, see the corresponding manual page,
+ * For a general overview on user namespaces, see the corresponding manual page
  * user_namespaces(7). In general, user namespaces allow unprivileged users to
  * run privileged executables, by rewriting uids inside the namespaces (and
  * in particular, a user can be root inside the namespace, but not outside),
- * with the kernel still enforcing, access protection as if the user was
+ * with the kernel still enforcing access protection as if the user was
  * unprivilged (to all files and resources not created exclusively within the
  * namespace). Absent kernel bugs, this provides relatively strong protections
  * against misconfiguration (because no true privilege is ever bestowed upon
  * the sandbox). It should be noted however, that there were such kernel bugs
- * as recently as Feb 2016, so it is imperative that this is run on a recent and
- * fully patched kernel.
+ * as recently as Feb 2016.  These were sneaky privilege escalation bugs,
+ * rather unimportant to the use case of BinaryBuilder, but a recent and fully
+ * patched kernel should be considered essential for any security-sensitive
+ * work done on top of this infrastructure.
  */
-static void configure_user_namespace(pid_t pid) {
+static void configure_user_namespace(uid_t uid, gid_t gid, pid_t pid) {
   int nbytes = 0;
 
+  if (verbose) {
+    printf("--> Mapping %d:%d to root:root within container namespace\n", uid, gid);
+  }
+
   // Setup uid map
   int uidmap_fd = open_proc_file(pid, "uid_map", O_WRONLY);
   check(uidmap_fd != -1);
   char uidmap[100];
-  nbytes = snprintf(uidmap, sizeof(uidmap), "0\t%d\t1", getuid());
+  nbytes = snprintf(uidmap, sizeof(uidmap), "0\t%d\t1", uid);
   check(nbytes > 0 && nbytes <= sizeof(uidmap));
   check(write(uidmap_fd, uidmap, nbytes) == nbytes);
   close(uidmap_fd);
@@ -95,188 +160,257 @@ static void configure_user_namespace(pid_t pid) {
   int gidmap_fd = open_proc_file(pid, "gid_map", O_WRONLY);
   check(gidmap_fd != -1);
   char gidmap[100];
-  nbytes = snprintf(gidmap, sizeof(gidmap), "0\t%d\t1", getgid());
+  nbytes = snprintf(gidmap, sizeof(gidmap), "0\t%d\t1", gid);
   check(nbytes > 0 && nbytes <= sizeof(gidmap));
   check(write(gidmap_fd, gidmap, nbytes) == nbytes);
 }
 
-// Options (gets filled in by driver code)
-char *sandbox_root = NULL;
-char *overlay = NULL;
-char *overlay_workdir = NULL;
-char *workspace = NULL;
-char *new_cd = NULL;
-unsigned char verbose = 0;
 
-struct map_list {
-    char *map_path;
-    char *outside_path;
-    struct map_list *prev;
-};
-
-struct map_list *maps;
-
-/* Mount an overlayfs on "overlayfs_root", anchoring the changes within the
- * temporary folders within /proc/upper and /proc/work created by
- * sandbox_main()
+/*
+ * Mount an overlayfs from `src` onto `dest`, anchoring the changes made to the overlayfs
+ * within the folders `root_dir`/upper and `root_dir`/work.  Note that the common case of
+ * `src` == `dest` signifies that we "shadow" the original source location and will simply
+ * discard any changes made to it when the overlayfs disappears.  This is how we protect our
+ * rootfs and shards when mounting from a local filesystem, as well as how we convert a
+ * read-only rootfs and shards to a read-write system when mounting from squashfs images.
  */
-static void create_overlay(const char * overlay_root, const char *mount_point,
-                           const char * bname) {
-    char upper_dir[PATH_MAX], work_dir[PATH_MAX], opts[3*PATH_MAX+40];
-    if (!bname) {
-        bname = basename(overlay_root);
-    }
-    if (!mount_point) {
-        mount_point = overlay_root;
-    }
+static void mount_overlay(const char * src, const char * dest, const char * bname,
+                          const char * work_dir, uid_t uid, gid_t gid) {
+  char upper[PATH_MAX], work[PATH_MAX], opts[3*PATH_MAX+28];
+
+  // Construct the location of our upper and work directories
+  snprintf(upper, sizeof(upper), "%s/upper/%s", work_dir, bname);
+  snprintf(work, sizeof(work), "%s/work/%s", work_dir, bname);
+
+  // If `src` is "", we actually want it to be "/", so adapt here because this is the
+  // only place in the code base where we actually need the slash at the end of the
+  // directory name.
+  if (src[0] == '\0') {
+    src = "/";
+  }
+
+  if (verbose) {
+    printf("--> Mounting overlay of %s at %s (modifications in %s)\n", src, dest, upper);
+  }
 
-    snprintf(upper_dir, sizeof(upper_dir), "/proc/upper/%s", bname);
-    snprintf(work_dir, sizeof(work_dir), "/proc/work/%s", bname);
-    snprintf(opts, sizeof(opts), "lowerdir=%s,upperdir=%s,workdir=%s",
-             overlay_root, upper_dir, work_dir);
+  // Make the upper and work directories
+  check(0 == mkdir(upper, 0777));
+  check(0 == mkdir(work, 0777));
 
-    if (verbose) {
-        printf("--> Mounting overlay of %s at %s (modifications in %s)\n", overlay_root, mount_point, upper_dir);
-    }
+  // Construct the opts, mount the overlay
+  snprintf(opts, sizeof(opts), "lowerdir=%s,upperdir=%s,workdir=%s", src, upper, work);
+  check(0 == mount("overlay", dest, "overlay", 0, opts));
 
-    check(0 == mkdir(upper_dir, 0777));
-    check(0 == mkdir(work_dir, 0777));
-    check(0 == mount("overlay", mount_point, "overlay", 0, opts));
+  // Chown this directory to the desired UID/GID, so that it doesn't look like it's
+  // owned by "nobody" when we're inside the sandbox
+  check(0 == chown(dest, uid, gid));
 }
 
-/* This is the main pid. We exit the sandbox once this pid dies */
-pid_t main_pid;
+static void mount_overlaywork(const char * work_dir) {
+  char path[PATH_MAX];
 
-/*
- * We support running this binary either standalone (which will create a
- * user namespace sandbox) or as init inside a VM.
- */
-static int is_init;
-
-static void early_fs_mount() {
-  check(0 == mount("proc", "/proc", "proc", 0, ""));
+  if (verbose) {
+    printf("--> Creating overlay workdir at %s\n", work_dir);
+  }
+  check(0 == mount("tmpfs", work_dir, "tmpfs", 0, "size=1G"));
+
+  // Create "upper" and "work" directories within this temporary filesystem
+  // to hold the modifications and temporary data the overlayfs filesystems
+  // will require.  We don't care about these modifications, because these
+  // are the modifications that will be created by misbehaving programs that
+  // install things into the root directory (or other shards).  The actual
+  // workspace overlayfs will have a different upper/work setup.
+  snprintf(path, sizeof(path), "%s/upper", work_dir);
+  check(0 == mkdir(path, 0777));
+  snprintf(path, sizeof(path), "%s/work", work_dir);
+  check(0 == mkdir(path, 0777));
 }
 
-static void devtempfs_mount() {
-  check(0 == mount("devtmpfs", "dev", "devtmpfs", 0, ""));
-  check(0 == mkdir("dev/pts", 0600));
-  //int fd = open("dev/ptmx", O_CREAT);
-  //check(fd != -1);
-  //check(0 == close(fd));
-}
+static void mount_procfs(const char * root_dir) {
+  char path[PATH_MAX];
 
-static void all_fs_mount() {
-  early_fs_mount();
-  //check(0 == mount("sandboxpts", "/dev/pts", "devpts", 0, "newinstance"));
-  //check(0 == mount("/dev/pts/ptmx", "/dev/ptmx", "", MS_BIND, NULL));
+  // Mount procfs at /proc
+  snprintf(path, sizeof(path), "%s/proc", root_dir);
+  if (verbose) {
+    printf("--> Mounting procfs at %s\n", path);
+  }
+  check(0 == mount("proc", path, "proc", 0, ""));
 }
 
-/* Sets up the jail, prepares the initial linux environment,
-   then execs busybox */
-static int sandbox_main(int sandbox_argc, char **sandbox_argv) {
-  pid_t pid;
-  int status;
-  check(is_init || sandbox_root != NULL);
-
-  /// Set up a temporary file system to use to hold all the upper dirs for our
-  /// overlay.  We re-use /proc outside the chroot for this purpose, because
-  /// it's a directory that is required to exist for the sandbox to work and
-  /// is not otherwise accessed.
-  check(0 == mount("tmpfs", "/proc", "tmpfs", 0, "size=1G"));
-  check(0 == mkdir("/proc/upper", 0777));
-  check(0 == mkdir("/proc/work", 0777));
-
-  char *rootfs_mount_point = sandbox_root;
-  if (is_init) {
-    sandbox_root = "/";
-    rootfs_mount_point = "/proc";
-    create_overlay("/", "/tmp", "root");
-    chdir("/tmp");
-  } else {
-    create_overlay(sandbox_root, NULL, NULL);
-    chdir(sandbox_root);
-  }
+/*
+ * We use this method to get /dev in shape.  If we're running as init, we need to
+ * mount full-blown devtmpfs at /dev.  If we're just a sandbox, we only bindmount
+ * /dev/null into our root_dir.
+ */
+static void mount_dev(const char * root_dir) {
+  char path[PATH_MAX];
 
-  /// Setup the workspace
-  if (workspace) {
-    // If the workspace is specified as 9p:, try to mount it as a 9p share
-    if (strncmp("9p:", workspace, 3) == 0) {
-      check(0 == mount(workspace+3, "workspace", "9p", 0, "trans=virtio,version=9p2000.L"));
-    } else {
-      // We don't expect workspace to have any submounts in normal operation.
-      // However, for runshell(), workspace could be an arbitrary directory,
-      // including one with sub-mounts, so allow that situation.
-      check(0 == mount(workspace, "workspace", "", MS_BIND|MS_REC, NULL));
+  // Mount devtmpfs at /dev
+  if (execution_mode == INIT_MODE) {
+    snprintf(path, sizeof(path), "%s/dev", root_dir);
+    if (verbose) {
+      printf("--> Mounting /dev at %s\n", path);
     }
+    check(0 == mount("devtmpfs", path, "devtmpfs", 0, ""));
+
+    // Create /dev/pts directory
+    snprintf(path, sizeof(path), "%s/dev/pts", root_dir);
+    check(0 == mkdir(path, 0600));
+  } else {
+    // Bindmount /dev/null into our root_dir
+    snprintf(path, sizeof(path), "%s/dev/null", root_dir);
+    if (verbose) {
+      printf("--> Mounting /dev/null at %s\n", path);
+    }
+    check(0 == mount("/dev/null", path, "", MS_BIND, NULL));
   }
+}
+
+static void mount_workspace(const char * root_dir, const char * workspace) {
+  char path[PATH_MAX];
+
+  // Mount workspace at /workspace
+  snprintf(path, sizeof(path), "%s/workspace", root_dir);
 
-  // In a VM, we may have to mount devices
-  if (is_init) {
-     devtempfs_mount();
+  if (strncmp("9p:", workspace, 3) == 0) {
+    // If we're running as init within QEMU, the workspace is a plan 9 mount
+    check(0 == mount(workspace+3, path, "9p", 0, "trans=virtio,version=9p2000.L"));
   } else {
-     /// Bind host /dev/null in the sandbox
-     check(0 == mount("/dev/null", "dev/null", "", MS_BIND, NULL));
+    // We don't expect workspace to have any submounts in normal operation.
+    // However, for runshell(), workspace could be an arbitrary directory,
+    // including one with sub-mounts, so allow that situation.
+    check(0 == mount(workspace, path, "", MS_BIND | MS_REC, NULL));
   }
+}
+
+/*
+ * This will mount the rootfs and shards within the given root directory.
+ * `root_dir`  is the path where the rootfs is mounted on the outside.
+ * `dest` is the path where the rootfs and all should be mounted.
+ * `shard_maps` is the list of mappings that we've been told to mount.
+ */
+static void mount_rootfs_and_shards(const char * root_dir, const char * dest,
+                                    const char * work_dir, struct map_list * shard_maps,
+                                    uid_t uid, gid_t gid) {
+  // The first thing we do is create an overlay mounting sandbox_root into our root_dir.
+  // The meaning of this is different across our different execution modes:
+  //  * Init mode: root_dir is "/", dest is "/tmp" because we need a read-writeable
+  //    rootfs, but it's already mounted as our root.
+  //  * Privileged mode: root_dir is the path to the already loopback-mounted rootfs
+  //    image, we are mounting it as an overlay within `dest`, a new directory that we
+  //    will chroot into, then clone ourselves into a userns within.
+  //  * Unprivileged mode: root_dir is the path to the already loopback-mounted rootfs
+  //    image, we are mounting it as an overlay within `dest`, a new directory that we
+  //    have already entered into within a userns.
+  mount_overlay(root_dir, dest, "rootfs", work_dir, uid, gid);
+
+  // We're definitely gonna do some path manipulation
+  char path[PATH_MAX];
 
   /// Apply command-line specified mounts
-  struct map_list *current_entry = maps;
+  struct map_list *current_entry = shard_maps;
   while (current_entry != NULL) {
-      char *inside = current_entry->map_path;
-      // Take the path relative to sandbox root (i.e. cwd)
-      if (inside[0] == '/') {
-          inside = inside + 1;
-      }
-      if (verbose) {
-          printf("--> Mapping %s to %s\n", inside, current_entry->outside_path);
-      }
-      check((current_entry->outside_path[0] == '/' ||
-            strncmp(current_entry->outside_path, "9p/", 3) == 0) && "Outside path must be absolute or 9p");
-
-      // Create the inside directory, if we need to
-      DIR *d = opendir(inside);
-      if (d == NULL) {
-          check(0 == mkdir(inside, 0777));
-      } else {
-          closedir(d);
-      }
+    char *inside = current_entry->map_path;
+    
+    // Take the path relative to root_dir
+    while (inside[0] == '/') {
+      inside = inside + 1;
+    }
+    snprintf(path, sizeof(path), "%s/%s", dest, inside);
 
-      // If specified as a device, mount as squashfs
-      if (strncmp(current_entry->outside_path, "/dev", 4) == 0) {
-          check(0 == mount(current_entry->outside_path, inside, "squashfs", 0, ""));
-      } else if (strncmp(current_entry->outside_path, "9p/", 3) == 0) {
-          check(0 == mount(current_entry->outside_path+3, inside, "9p", MS_RDONLY, "trans=virtio,version=9p2000.L"));
-      } else {
-          check(0 == mount(current_entry->outside_path, inside, "", MS_BIND, NULL));
-          // Remount to read-only, nodev, suid.
-          // We only really care about read-only, but we need to make sure
-          // to be stricter than our parent mount. If the parent mount is
-          // noexec, we're out of luck, since we do need to execute these
-          // files. However, we don't really have a need for suid (only one
-          // uid) or device files (none in the image), so passing those extra
-          // flags is harmless. If, we ever cared in the future, the thing
-          // to do would be to read /proc/self/fdinfo or the directory, find
-          // the mnt_id and extract the correct flags from /proc/self/mountinfo.
-          check(0 == mount(current_entry->outside_path, inside, "",
-            MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOSUID, NULL));
-      }
+    // Map <inside> to the given outside path
+    if (verbose) {
+      printf("--> Mapping %s to %s\n", current_entry->outside_path, path);
+    }
+
+    // Create the inside directory, not freaking out if it already exists.
+    int result = mkdir(path, 0777);
+    check((0 == result) || (errno == EEXIST));
+
+    if (strncmp(current_entry->outside_path, "/dev", 4) == 0) {
+      // If we're running on QEMU, we pass mounts in as virtual devices, which we
+      // know are always passed-through .squashfs files.
+      check(0 == mount(current_entry->outside_path, path, "squashfs", 0, ""));
+    } else if (strncmp(current_entry->outside_path, "9p/", 3) == 0) {
+      // If we're running on QEMU, we pass in mappings as plan 9 shares
+      check(0 == mount(current_entry->outside_path+3, path, "9p", MS_RDONLY, "trans=virtio,version=9p2000.L"));
+    } else {
+      // If it's a normal directory, just bind mount it in
+      check(0 == mount(current_entry->outside_path, path, "", MS_BIND, NULL));
+      
+      // Remount to read-only, nodev, nosuid.
+      // We only really care about read-only, but we need to make sure
+      // to be stricter than our parent mount. If the parent mount is
+      // noexec, we're out of luck, since we do need to execute these
+      // files. However, we don't really have a need for suid (only one
+      // uid) or device files (none in the image), so passing those extra
+      // flags is harmless. If we ever cared in the future, the thing
+      // to do would be to read /proc/self/fdinfo or the directory, find
+      // the mnt_id and extract the correct flags from /proc/self/mountinfo.
+      check(0 == mount(current_entry->outside_path, path, "",
+                       MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOSUID, NULL));
+    }
 
-      // Slap an overlay on top to allow future changes
-      create_overlay(inside, NULL, NULL);
+    // Slap an overlay on top of the inside mapping to allow future changes
+    mount_overlay(path, path, basename(path), work_dir, uid, gid);
 
-      current_entry = current_entry->prev;
+    current_entry = current_entry->prev;
+  }
+}
+
+/*
+ * Helper function that mounts pretty much everything:
+ *   - procfs
+ *   - our overlay work directory
+ *   - the rootfs,
+ *   - the shards
+ *   - the workspace (if given by the user)
+ */
+static void mount_the_world(const char * root_dir, const char * dest,
+                            const char * workspace, struct map_list * shard_maps,
+                            uid_t uid, gid_t gid) {
+  // Mount the place we'll put all our overlay work directories
+  mount_overlaywork("/proc");
+
+  // Next, overlay all the things
+  mount_rootfs_and_shards(root_dir, dest, "/proc", shard_maps, uid, gid);
+  
+  // Mount /proc within the sandbox
+  mount_procfs(dest);
+  
+  // Mount /dev stuff
+  mount_dev(dest);
+
+  // Only try to mount our workspace if it was given to us.
+  if (workspace != NULL) {
+    mount_workspace(dest, workspace);
   }
 
-  /// Enter chroot
+  // Once we're done with that, put /proc back in its place in the big world.
+  mount_procfs("");
+}
+
+/*
+ * Sets up the chroot jail, then executes the target executable.
+ */
+static int sandbox_main(const char * root_dir, const char * new_cd, int sandbox_argc, char **sandbox_argv) {
+  pid_t pid;
+  int status;
+
+  // Enter chroot
+  check(0 == chdir(root_dir));
   check(0 == chroot("."));
 
+  // If we've got a directory to change to, do so
   if (new_cd) {
     check(0 == chdir(new_cd));
   }
 
-  // Set up the environment
-  all_fs_mount();
+  // fflush before forking
   fflush(stdout);
 
+  // When the main pid dies, we exit.
+  pid_t main_pid;
   if ((main_pid = fork()) == 0) {
     if (verbose) {
       printf("About to run `%s` ", sandbox_argv[0]);
@@ -301,7 +435,8 @@ static int sandbox_main(int sandbox_argc, char **sandbox_argv) {
   for (;;) {
     int sig;
     sigwait(&waitset, &sig);
-    size_t reaped_pid;
+    
+    pid_t reaped_pid;
     while ((reaped_pid = waitpid(-1, &status, WNOHANG)) != -1) {
       if (reaped_pid == main_pid) {
         // If it was the main pid that exited, return as well.
@@ -317,6 +452,8 @@ static void print_help() {
   fputs("[--verbose] [--help] <cmd>\n", stderr);
 }
 
+// Helper function to read from the serial file descriptor, blocking until we
+// can read the requested number of bytes.
 void read_blocking(int fd, char * buff, int num_bytes) {
   int bytes_read = 0;
 
@@ -330,6 +467,8 @@ void read_blocking(int fd, char * buff, int num_bytes) {
   }
 }
 
+// We have a special way of reading in arguments when running as init,
+// where we read from a fake serial device.
 static void read_sandbox_args(int fd, int * argc, char *** argv) {
   // First, read the number of sandbox args:
   *argc = 0;
@@ -353,11 +492,14 @@ static void read_sandbox_args(int fd, int * argc, char *** argv) {
   (*argv)[*argc] = NULL;
 }
 
+// We have a special way of creating a useful environment when running as init,
+// where we read in the environment variables from a fake serial device.
 static void read_sandbox_env(int fd) {
+  // Clear the current environment.  No inheriting variables from QEMU!
   clearenv();
+
   int num_env_mappings = 0;
   read_blocking(fd, (char *)&num_env_mappings, sizeof(int));
-
   if (verbose) {
     printf("Reading %d environment mappings\n", num_env_mappings);
   }
@@ -391,28 +533,47 @@ static void read_sandbox_env(int fd) {
   free(env_buff);
 }
 
-/******* Driver Code
- * Not much to see here, just putting it all together.
- */
 static void sigint_handler() { _exit(0); }
 
+/*
+ * Let's get this party started.
+ */
 int main(int sandbox_argc, char **sandbox_argv) {
   int status;
-  pid_t mypid = getpid();
   pid_t pgrp = getpgid(0);
-  is_init = mypid == 1;
   int cmdline_fd = -1;
 
-  if (is_init) {
-    // Mount our file systems right away so we can start using them
-    early_fs_mount();
+  // First, determine our execution mode based on pid and euid
+  if (getpid() == 1) {
+    execution_mode = INIT_MODE;
+  } else if(geteuid() == 0) {
+    execution_mode = PRIVILEGED_CONTAINER_MODE;
+  } else {
+    execution_mode = UNPRIVILEGED_CONTAINER_MODE;
+  }
+
+  uid_t uid = getuid();
+  gid_t gid = getgid();
+  
+  // If we're running inside of `sudo`, we need to grab the UID/GID of the calling user through
+  // environment variables, not using `getuid()` or `getgid()`.  :(
+  const char * SUDO_UID = getenv("SUDO_UID");
+  if (SUDO_UID != NULL && SUDO_UID[0] != '\0') {
+    uid = strtol(SUDO_UID, NULL, 10);
+  }
+  const char * SUDO_GID = getenv("SUDO_GID");
+  if (SUDO_GID != NULL && SUDO_GID[0] != '\0') {
+    gid = strtol(SUDO_GID, NULL, 10);
+  }
 
+  // If we're running in init mode, we need to do some initial startup; we need to mount /proc,
+  // and we need to read in our command line arguments over a virtual serial device, since we
+  // have no other way for Julia to speak to us running inside of qemu.
+  if (execution_mode == INIT_MODE) {
     // Extract our command line from the second serial device created by BinaryBuilder.jl
     const char * comm_dev = "/dev/vport0p1";
     cmdline_fd = open(comm_dev, O_RDONLY);
-    if( cmdline_fd != -1 ) {
-      read_sandbox_args(cmdline_fd, &sandbox_argc, &sandbox_argv);
-    } else {
+    if( cmdline_fd == -1 ) {
       // This is a debugging escape hatch for us developers that aren't clever enough and
       // somehow screw up the Julia <---> qemu <---> sandbox communication channel.
       printf("Running as init but couldn't open %s; entering debugging mode!\n", comm_dev);
@@ -424,6 +585,9 @@ int main(int sandbox_argc, char **sandbox_argv) {
       sandbox_argv[3] = "9p:workspace";
       sandbox_argv[4] = "/bin/bash";
       sandbox_argv[5] = NULL;
+    } else {
+      // If we have a communication channel, then let's read in our argc and argv!
+      read_sandbox_args(cmdline_fd, &sandbox_argc, &sandbox_argv);
     }
   }
 
@@ -447,12 +611,24 @@ int main(int sandbox_argc, char **sandbox_argv) {
       break;
 
     switch( c ) {
+      case '?':
       case 'h':
         print_help();
         return 0;
       case 'v':
         verbose = 1;
-        printf("verbose sandbox enabled\n");
+        printf("verbose sandbox enabled (running in ");
+        switch (execution_mode) {
+          case INIT_MODE:
+            printf("init");
+            break;
+          case UNPRIVILEGED_CONTAINER_MODE:
+            printf("un");
+          case PRIVILEGED_CONTAINER_MODE:
+            printf("privileged container");
+            break;
+        }
+        printf(" mode)\n");
         break;
       case 'r': {
         sandbox_root = strdup(optarg);
@@ -486,14 +662,18 @@ int main(int sandbox_argc, char **sandbox_argv) {
         entry->map_path = strdup(colon + 1);
         entry->outside_path = strndup(optarg, (colon - optarg));
         entry->prev = maps;
-        maps = entry;
+
         if (verbose) {
           printf("Parsed --map as \"%s\" -> \"%s\"\n", entry->outside_path, entry->map_path);
         }
+
+        if ((entry->outside_path[0] != '/') && (strncmp(entry->outside_path, "9p/", 3) != 0)) {
+          printf("ERROR: Outside path \"%s\" must be absolute or 9p!  Ignoring...\n", entry->outside_path);
+          free(entry);
+        } else {
+          maps = entry;
+        }
       } break;
-      case '?':
-        print_help();
-        return 1;
       default:
         fputs("getoptlong defaulted?!\n", stderr);
         return 1;
@@ -505,30 +685,38 @@ int main(int sandbox_argc, char **sandbox_argv) {
   sandbox_argc -= optind;
 
   // If we don't have a command, die
-  if (sandbox_argc == 0 ) {
+  if (sandbox_argc == 0) {
     fputs("No <cmd> given!\n", stderr);
     print_help();
     return 1;
   }
 
   // If we're not init but we haven't been given a sandbox root, die
-  if (!is_init && !sandbox_root) {
+  if (!(execution_mode == INIT_MODE) && !sandbox_root) {
     fputs("--rootfs is required, unless running as init!\n", stderr);
     print_help();
     return 1;
   }
 
-  // If we have a cmdline_fd, then pull out the environment from it now
-  if( cmdline_fd != -1 ) {
+  // If we are running as init, read in our mandated environment variables,
+  // then sub off to sandbox_main and finally reboot
+  if (execution_mode == INIT_MODE) {
     read_sandbox_env(cmdline_fd);
-  }
 
-  // If we are running as init, run sandbox_main then reboot
-  if (is_init) {
+    // Take over the terminal
     setsid();
     ioctl(0, TIOCSCTTY, 1);
-    sandbox_main(sandbox_argc, sandbox_argv);
-	  sync();
+
+    // Let's mount our world.  Since we're running as init, the rootfs is already mounted
+    // at "/", but it's read-only, so we use overlayfs to mount it on "/tmp".  We then
+    // continue to mount our shards within "/tmp".
+    mount_the_world("", "/tmp", workspace, maps, 0, 0);
+
+    // Run sandbox_main to Enter The Sandbox (TM)
+    sandbox_main("/tmp", new_cd, sandbox_argc, sandbox_argv);
+
+    // Don't forget to `sync()` so that we don't lose any pending writes to the filesystem!
+    sync();
 
     // Goodnight, my sweet prince
     check(0 == reboot(RB_POWER_OFF));
@@ -537,19 +725,40 @@ int main(int sandbox_argc, char **sandbox_argv) {
     return 0;
   }
 
-
-  // Use a pipe for synchronization. The regular SIGSTOP method does not work
-  // because container-inits don't receive STOP or KILL signals from within
-  // their own pid namespace.
+  // If we're running in one of the container modes, we're going to syscall() ourselves a
+  // new, cloned process that runs inside a container. We will use a pipe for synchronization.
+  // The regular SIGSTOP method does not work because container-inits don't receive STOP or KILL
+  // signals from within their own pid namespace.
   int child_block[2], parent_block[2];
   pipe(child_block);
   pipe(parent_block);
-
   pid_t pid;
-  if ((pid = syscall(SYS_clone, CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWUSER | SIGCHLD,
-                     0, 0, 0, 0)) == 0) {
+
+  // If we are running as a privileged container, we need to build our mount mappings now.
+  if (execution_mode == PRIVILEGED_CONTAINER_MODE) {
+    // We dissociate ourselves from the typical mount namespace.  This gives us the freedom
+    // to start mounting things willy-nilly without mucking up the user's computer.
+    check(0 == unshare(CLONE_NEWNS));
+
+    // Even if we unshare, we might need to mark `/` as private, as systemd often subverts
+    // the kernel's default value of `MS_PRIVATE` on the root mount.  This doesn't affect
+    // the main root mount, because we have unshared, but this prevents our changes to
+    // any subtrees of `/` (e.g. everything) from propagating back to the outside `/`.
+    check(0 == mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL));
+
+    // Mount the rootfs, shards, and workspace.
+    mount_the_world(sandbox_root, sandbox_root, workspace, maps, uid, gid);
+  }
+
+  // We want to request a new PID space, a new mount space, and a new user space
+  int clone_flags = CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWUSER | SIGCHLD;
+  if ((pid = syscall(SYS_clone, clone_flags, 0, 0, 0, 0)) == 0) {
+    // If we're in here, we have become the "child" process, within the container.
+    
+    // Get rid of the ends of the synchronization pipe that I'm not going to use
     close(child_block[1]);
     close(parent_block[0]);
+    
     // N.B: Capabilities in the original user namespaces are now dropped
     // The kernel may have decided to reset our dumpability, because of
     // the privilege change. However, the parent needs to access our /proc
@@ -561,41 +770,56 @@ int main(int sandbox_argc, char **sandbox_argv) {
     // all signals.
     signal(SIGINT, sigint_handler);
 
-    // Tell the parent we're ready
+    // Tell the parent we're ready, and wait until it signals that it's done
+    // setting up our UID/GID mapping in configure_user_namespace()
     close(parent_block[1]);
-
-    // This will block until the parent closes fds[1]
     check(0 == read(child_block[0], NULL, 1));
 
-    return sandbox_main(sandbox_argc, sandbox_argv);
+    if (execution_mode == PRIVILEGED_CONTAINER_MODE) {
+      // If we are in privileged container mode, let's go ahead and drop back
+      // to the original calling user's UID and GID, which has been mapped to
+      // zero within this container.
+
+      check(0 == setuid(0));
+      check(0 == setgid(0));
+    }
+
+    if (execution_mode == UNPRIVILEGED_CONTAINER_MODE) {
+      // If we're unprivileged, we now take advantage of our new root status
+      // to mount the world.
+      mount_the_world(sandbox_root, sandbox_root, workspace, maps, 0, 0);
+    }
+
+    // Finally, we begin invocation of the target program
+    return sandbox_main(sandbox_root, new_cd, sandbox_argc, sandbox_argv);
   }
+
+  // If we're out here, we are still the "parent" process.  The Prestige lives on.
+
+  // Check to make sure that the clone actually worked
+  check(pid != -1);
+
+  // Get rid of the ends of the synchronization pipe that I'm not going to use.
   close(child_block[0]);
   close(parent_block[1]);
 
   // Wait until the child is ready to be configured.
   check(0 == read(parent_block[0], NULL, 1));
-
   if (verbose) {
     printf("Child Process PID is %d\n", pid);
   }
 
-  configure_user_namespace(pid);
+  // Configure user namespace for the child PID.
+  configure_user_namespace(uid, gid, pid);
 
-  // Resume the child
+  // Signal to the child that it can now continue running.
   close(child_block[1]);
+  
   // Wait until the child exits.
   check(pid == waitpid(pid, &status, 0));
   check(WIFEXITED(status));
-
   if (verbose) {
-      printf("Child Process exited, exit code %d\n", WEXITSTATUS(status));
-  }
-
-  // Delete (empty) work directory
-  {
-      char work_dir_path[PATH_MAX];
-      sprintf(&work_dir_path[0], "%s/work", overlay_workdir);
-      rmdir(&work_dir_path[0]);
+    printf("Child Process exited, exit code %d\n", WEXITSTATUS(status));
   }
 
   // Give back the terminal to the parent
diff --git a/docs/src/FAQ.md b/docs/src/FAQ.md
index cffa185..25b3957 100644
--- a/docs/src/FAQ.md
+++ b/docs/src/FAQ.md
@@ -22,4 +22,8 @@ Absolutely!  There's nothing Julia-specific about the binaries generated by the
 
 ### What platforms are supported?
 
-At the time of writing, we support Linux (x86_64, i686, armv7l, aarch64, ppc64le), Windows (x86_64, i686) and macOS (x86_64).
\ No newline at end of file
+At the time of writing, we support Linux (x86_64, i686, armv7l, aarch64, ppc64le), Windows (x86_64, i686) and macOS (x86_64).
+
+### At line XXX, ABORTED (Operation not permitted)!
+
+Some Linux distributions have a bug in their `overlayfs` implementation that prevents us from mounting overlay filesystems within user namespaces.  See [this Ubuntu kernel bug report](https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1531747) for a description of the situation and how Ubuntu has patched it in their kernels.  To work around this, you can launch `BinaryBuilder.jl` in "privileged container" mode.  Unfortunately, this involves running `sudo` every time you launch into a BinaryBuilder session, but on the other hand, this successfully works around the issue on distributions such as Arch Linux.  To set "privileged container" mode, set the `BINARYBUILDER_RUNNER` environment variable to `privileged`.
\ No newline at end of file
diff --git a/docs/src/environment_variables.md b/docs/src/environment_variables.md
index be6406c..c7eb937 100644
--- a/docs/src/environment_variables.md
+++ b/docs/src/environment_variables.md
@@ -12,4 +12,6 @@
 
 * `BINARYBUILDER_SHARDS_DIR`: When set to a path, cross-compiler shards will be unpacked/mounted to this location, instead of the default location of `<binarybuilder_root>/deps/shards`.
 
-* `BINARYBUILDER_QEMU_DIR`: When set to a path, qemu/the linux kernel will be installed here (if using the `QemuRunner`) instead of the default location of `<binarybuilder_root>/deps/qemu`
\ No newline at end of file
+* `BINARYBUILDER_QEMU_DIR`: When set to a path, qemu/the linux kernel will be installed here (if using the `QemuRunner`) instead of the default location of `<binarybuilder_root>/deps/qemu`
+
+* `BINARYBUILDER_RUNNER`: When set to a runner string, alters the execution engine that `BinaryBuilder.jl` will use to wrap the build process in a sandbox.  Valid values are one of `"userns"`, `"privileged"` and `"qemu"`.  If not given, `BinaryBuilder.jl` will do its best to guess.
\ No newline at end of file
diff --git a/src/BinaryBuilder.jl b/src/BinaryBuilder.jl
index 78c4e62..a1541fc 100644
--- a/src/BinaryBuilder.jl
+++ b/src/BinaryBuilder.jl
@@ -20,7 +20,7 @@ include("AutoBuild.jl")
 include("Wizard.jl")
 
 function __init__()
-    global downloads_cache, rootfs_cache, use_squashfs, automatic_apple, shards_cache
+    global downloads_cache, rootfs_cache, use_squashfs, automatic_apple, shards_cache, runner_override
     global qemu_cache
 
     # If the user has overridden our rootfs tar location, reflect that here:
@@ -64,6 +64,12 @@ function __init__()
     if get(ENV, "BINARYBUILDER_AUTOMATIC_APPLE", "") == "true"
         automatic_apple = true
     end
+
+    runner_override = lowercase(get(ENV, "BINARYBUILDER_RUNNER", ""))
+    if !(runner_override in ["", "userns", "qemu", "privileged"])
+        warn("BINARYBUILDER_RUNNER value is invalid, ignoring...")
+        runner_override = ""
+    end
 end
 
 """
diff --git a/src/QemuRunner.jl b/src/QemuRunner.jl
index 206303e..49538a6 100644
--- a/src/QemuRunner.jl
+++ b/src/QemuRunner.jl
@@ -182,8 +182,6 @@ function qemu_gen_cmd(qr::QemuRunner, cmd::Cmd, comm_socket_path::String)
         end
 
         close(commsock)
-
-        info("Wrote it all!")
     end
 
     long_cmd = ```
diff --git a/src/RootFS.jl b/src/RootFS.jl
index 8c12c29..5f29ed0 100644
--- a/src/RootFS.jl
+++ b/src/RootFS.jl
@@ -50,7 +50,7 @@ Returns the URL from which a rootfs image (tarball/squashfs) can be downloaded
 function get_shard_url(target::String = "base"; squashfs::Bool = use_squashfs)
     # These constants are what should be updated for a new rootfs build:
     rootfs_urlroot = "https://julialangmirror.s3.amazonaws.com/binarybuilder"
-    rootfs_version = "2018-01-04"
+    rootfs_version = "2018-02-02"
 
     shard_name = "rootfs-$(target)"
     ext = squashfs ? "squashfs" : "tar.gz"
@@ -68,26 +68,26 @@ function get_shard_hash(triplet::String = "base"; squashfs::Bool = use_squashfs)
     # You can get these dictionaries spat out by running `make print-hashes`
     # in `julia-docker/crossbuild`, after running `make shards`.
     squashfs_hashes = Dict(
-        "aarch64-linux-gnu" => "8498d14519a896b2ecb54ff238d3087632d6f7624ce8db63ef8c2fc33f275417",
-        "arm-linux-gnueabihf" => "a23c6be7c3631af76fac4736cd5a90a13fc1a81e6ae21262a50a525b6c0b15ac",
-        "base" => "dad9b3c4b5e5b469a5e4a3558013a75cab009b41f6bec498cea04cc424811a63",
-        "i686-linux-gnu" => "dd700a5e0eff58aff89d1e9667277edf1c61e115d6f468223e48b4378709951a",
-        "i686-w64-mingw32" => "aba0cffb4888ff14531c628fb477d842a0335b5446e41b8647b31cb45fb78d75",
-        "powerpc64le-linux-gnu" => "40cf51c28dd3b1f8fae9eff2e1dea8c23048ef8da86562b1e63b33680c0d0406",
-        "x86_64-apple-darwin14" => "3425b3e251af2887bf05396d1c5f135a9c2807c576ff8b8793a9630eef44b91f",
-        "x86_64-linux-gnu" => "16275a4029f90349893b711bce4f257e7aa8da287b6b6aba622b6f68dcb6fae2",
-        "x86_64-w64-mingw32" => "7a130e2722fda74e26023cac3ce8686568026c82e2de94581801ccb391cee7b5",
+        "aarch64-linux-gnu" => "3f3a35a957be999f4719925a212e6df7e78930057d0dbb5838d326ad90c68d96",
+        "arm-linux-gnueabihf" => "2b65716e5c6bfed7e8193dd3cda63e7d417b9e8e0beb1fe1993a6efaee3c7728",
+        "base" => "6bf9076d58a066ca31bdd87e70db8c3408e18fa2fb6d51afe2a238b91db8dee4",
+        "i686-linux-gnu" => "2e03b0bd44e8084b6819688def0e51908d278a5f0770450b41cba600c759095c",
+        "i686-w64-mingw32" => "1d451012f9ebd71607d1687b33d55f5c0d9e48fe850e815d5c87986e4a004966",
+        "powerpc64le-linux-gnu" => "447c643a8ada01689cdb59ab852c3857a4bafb7ecfa3fefcb18bf057efa57606",
+        "x86_64-apple-darwin14" => "10649f75dd56ff918c461cd8037bfa4b0d562115a7eb79ec09837a2f388e87a6",
+        "x86_64-linux-gnu" => "1b530921f42ed41830d65c1189d96561f876a6035d4881e3442dfe0bf5fe96b1",
+        "x86_64-w64-mingw32" => "cc02add0d8042d42bf0ea9aad2d4dd11b803f46dae1ec268071b8db8f55f5adb",
     )
     tarball_hashes = Dict(
-        "aarch64-linux-gnu" => "2518367658a5d7d436e382877382f83ebc70964a717b107762768d3998a5bcd4",
-        "arm-linux-gnueabihf" => "1a4bd7ff2f2e6e7c4780a41525ac92c55f9b4e53a1723463516363d41e0679f4",
-        "base" => "74476d28ab5b82d674199e126eb845b9d47a0b95fb8f88a9f9b10be4cdea15a1",
-        "i686-linux-gnu" => "362badf81379042cc487ec9ac6d92d91c3b1a6a8c0bebe07fa8f614dd2daf841",
-        "i686-w64-mingw32" => "ecc2452321d6ddaf9fdeb9c9c2ffc0071f81d80900d71186fd4f6c19249f8307",
-        "powerpc64le-linux-gnu" => "8fdbdb3235cb6977c4f6180807ac321a8425b1c2ca5ee1f9eeae2cc5bb11030d",
-        "x86_64-apple-darwin14" => "7c370ddef375dd5722290bfd5dfe92740ba313f02dac6cd741402c68b4f4a57e",
-        "x86_64-linux-gnu" => "57d75ebb07d8e1236b0a88acd3d9e1bba9e4bf3be319f8622c0d6cf8431e6fe6",
-        "x86_64-w64-mingw32" => "dbe747b59f371a148c33bc8c07929bc29948d57155d82d5dad02c4a2c42cd855",
+        "aarch64-linux-gnu" => "556a4cf9b09bf1a10e7bc42f37c453038ec83645dd7b398c60a04c679e410149",
+        "arm-linux-gnueabihf" => "a97d61dc1dfc82bc4e067779125b42b4faae72cfbeffe20eddeb38eb4a2c344d",
+        "base" => "61e6e12d5841c23d027a71f02155cbf5ffab0a3deddf787b406cae84917c1dda",
+        "i686-linux-gnu" => "1f2a8aaa29ceffba9b7653c3c377c1954de5edd57640abcd73adc94f7eb347ac",
+        "i686-w64-mingw32" => "a21bafd2f9f62ee21a1deb8013fa67e206b945a26fe189292bbac59570af37a5",
+        "powerpc64le-linux-gnu" => "e8127fb5b1b2412775595440b90f93e5632af32b733b37fade1f9853e281f954",
+        "x86_64-apple-darwin14" => "c8cfadd55886ca88d5121c1777238b4d77fbc4bbe8cd0e410b4425afb318f976",
+        "x86_64-linux-gnu" => "286fce86eebb66322ef4e1bc9e1d57019a8deb46ce37186bb011448d66235e4d",
+        "x86_64-w64-mingw32" => "297245569871a0d9ee495b167592385701e1c09f16ef1f307f564da2e95b69c6",
     )
 
     if squashfs
diff --git a/src/Runner.jl b/src/Runner.jl
index 249ed1f..373b1e5 100644
--- a/src/Runner.jl
+++ b/src/Runner.jl
@@ -70,8 +70,21 @@ function destdir_envs(destdir::String)
         "PKG_CONFIG_SYSROOT" => destdir)
 end
 
+runner_override = ""
 function preferred_runner()
-    Compat.Sys.islinux() ? UserNSRunner : QemuRunner
+    if runner_override != ""  # non-empty means an explicit runner was requested
+        if runner_override in ["userns", "privileged"]  # both values select UserNSRunner
+            return UserNSRunner
+        elseif runner_override in ["qemu"]
+            return QemuRunner
+        end
+    end  # any other override value falls through to the platform default below
+
+    if Compat.Sys.islinux()  # no usable override: UserNSRunner on Linux, QemuRunner elsewhere
+        return UserNSRunner
+    else
+        return QemuRunner
+    end
 end
 
 """
diff --git a/src/UserNSRunner.jl b/src/UserNSRunner.jl
index ee217cd..d6a9946 100644
--- a/src/UserNSRunner.jl
+++ b/src/UserNSRunner.jl
@@ -52,6 +52,12 @@ function UserNSRunner(workspace_root::String; cwd = nothing,
     # Construct sandbox command
     sandbox_cmd = `$(rootfs_dir("sandbox"))`
 
+    # Check to see if we need to run privileged containers.
+    # It would be nice to automatically prefer this eventually....
+    if runner_override == "privileged"
+        sandbox_cmd = `sudo -E $sandbox_cmd`
+    end
+
     if verbose
         sandbox_cmd = `$sandbox_cmd --verbose`
     end
diff --git a/test/.gitignore b/test/.gitignore
index 858fb58..0be801c 100644
--- a/test/.gitignore
+++ b/test/.gitignore
@@ -1,4 +1,4 @@
 build
 downloads
 products
-libfoo_tarballs
\ No newline at end of file
+libfoo_tarballs
diff --git a/test/runtests.jl b/test/runtests.jl
index f3243b3..47b2fd5 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -59,7 +59,7 @@ end
             ur = preferred_runner()(prefix.path; cwd="/workspace/", platform=platform)
 
             # Our simple executable file, generated by bash
-            test_exe_sandbox_path = joinpath("/workspace/bin","test_exe")
+            test_exe_sandbox_path = joinpath("/workspace/bin", "test_exe")
             test_exe_path = joinpath(bindir(prefix),"test_exe")
             test_exe = ExecutableProduct(test_exe_path)
             results = [test_exe]
@@ -91,7 +91,7 @@ end
             /usr/bin/make clean
             /usr/bin/make install
             """
-            dep = Dependency("foo", [libfoo, fooifier], script, platform, prefix)
+            dep = Dependency("fooifier", [libfoo, fooifier], script, platform, prefix)
 
             # Build it
             @test build(ur, dep; verbose=true)