Skip to content

Commit 04cbb13

Browse files
nlacasse and gvisor-bot
authored and committed
Give each container a distinct MountNamespace.
This keeps all container filesystems completely separate from each other (including from the root container filesystem), and allows us to get rid of the "__runsc_containers__" directory. It also simplifies container startup/teardown as we don't have to muck around in the root container's filesystem. PiperOrigin-RevId: 259613346
1 parent 5774599 commit 04cbb13

File tree

7 files changed

+366
-163
lines changed

7 files changed

+366
-163
lines changed

pkg/sentry/control/proc.go

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,12 @@ type ExecArgs struct {
5454
// Envv is a list of environment variables.
5555
Envv []string `json:"envv"`
5656

57+
// MountNamespace is the mount namespace to execute the new process in.
58+
// A reference on MountNamespace must be held for the lifetime of the
59+
// ExecArgs. If MountNamespace is nil, it will default to the kernel's
60+
// root MountNamespace.
61+
MountNamespace *fs.MountNamespace
62+
5763
// Root defines the root directory for the new process. A reference on
5864
// Root must be held for the lifetime of the ExecArgs. If Root is nil,
5965
// it will default to the VFS root.
@@ -145,6 +151,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
145151
Argv: args.Argv,
146152
Envv: args.Envv,
147153
WorkingDirectory: args.WorkingDirectory,
154+
MountNamespace: args.MountNamespace,
148155
Root: args.Root,
149156
Credentials: creds,
150157
FDTable: fdTable,
@@ -157,16 +164,25 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
157164
ContainerID: args.ContainerID,
158165
}
159166
if initArgs.Root != nil {
160-
// initArgs must hold a reference on Root. This ref is dropped
161-
// in CreateProcess.
167+
// initArgs must hold a reference on Root, which will be
168+
// donated to the new process in CreateProcess.
162169
initArgs.Root.IncRef()
163170
}
171+
if initArgs.MountNamespace != nil {
172+
// initArgs must hold a reference on MountNamespace, which will
173+
// be donated to the new process in CreateProcess.
174+
initArgs.MountNamespace.IncRef()
175+
}
164176
ctx := initArgs.NewContext(proc.Kernel)
165177

166178
if initArgs.Filename == "" {
167179
// Get the full path to the filename from the PATH env variable.
168180
paths := fs.GetPath(initArgs.Envv)
169-
f, err := proc.Kernel.RootMountNamespace().ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths)
181+
mns := initArgs.MountNamespace
182+
if mns == nil {
183+
mns = proc.Kernel.RootMountNamespace()
184+
}
185+
f, err := mns.ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths)
170186
if err != nil {
171187
return nil, 0, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err)
172188
}

runsc/boot/fs.go

Lines changed: 21 additions & 119 deletions
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,6 @@ const (
5454
// MountPrefix is the annotation prefix for mount hints.
5555
MountPrefix = "gvisor.dev/spec/mount"
5656

57-
// ChildContainersDir is the directory where child container root
58-
// filesystems are mounted.
59-
ChildContainersDir = "/__runsc_containers__"
60-
6157
// Filesystems that runsc supports.
6258
bind = "bind"
6359
devpts = "devpts"
@@ -256,10 +252,10 @@ func subtargets(root string, mnts []specs.Mount) []string {
256252

257253
// setExecutablePath sets the procArgs.Filename by searching the PATH for an
258254
// executable matching the procArgs.Argv[0].
259-
func setExecutablePath(ctx context.Context, mns *fs.MountNamespace, procArgs *kernel.CreateProcessArgs) error {
255+
func setExecutablePath(ctx context.Context, procArgs *kernel.CreateProcessArgs) error {
260256
paths := fs.GetPath(procArgs.Envv)
261257
exe := procArgs.Argv[0]
262-
f, err := mns.ResolveExecutablePath(ctx, procArgs.WorkingDirectory, exe, paths)
258+
f, err := procArgs.MountNamespace.ResolveExecutablePath(ctx, procArgs.WorkingDirectory, exe, paths)
263259
if err != nil {
264260
return fmt.Errorf("searching for executable %q, cwd: %q, $PATH=%q: %v", exe, procArgs.WorkingDirectory, strings.Join(paths, ":"), err)
265261
}
@@ -514,11 +510,16 @@ func (c *containerMounter) setupFS(ctx context.Context, conf *Config, procArgs *
514510

515511
// If this is the root container, we also need to setup the root mount
516512
// namespace.
517-
mns := c.k.RootMountNamespace()
518-
if mns == nil {
513+
rootMNS := c.k.RootMountNamespace()
514+
if rootMNS == nil {
519515
// Setup the root container.
520-
if err := c.setupRootContainer(ctx, rootCtx, conf, func(mns *fs.MountNamespace) {
521-
c.k.SetRootMountNamespace(mns)
516+
if err := c.setupRootContainer(ctx, rootCtx, conf, func(rootMNS *fs.MountNamespace) {
517+
// The callback to setupRootContainer inherits a
518+
// reference on the rootMNS, so we don't need to take
519+
// an additional reference here.
520+
procArgs.MountNamespace = rootMNS
521+
procArgs.Root = rootMNS.Root()
522+
c.k.SetRootMountNamespace(rootMNS)
522523
}); err != nil {
523524
return err
524525
}
@@ -527,54 +528,26 @@ func (c *containerMounter) setupFS(ctx context.Context, conf *Config, procArgs *
527528

528529
// Setup a child container.
529530
log.Infof("Creating new process in child container.")
530-
globalRoot := mns.Root()
531-
defer globalRoot.DecRef()
532-
533-
// Create mount point for the container's rootfs.
534-
maxTraversals := uint(0)
535-
contDir, err := mns.FindInode(ctx, globalRoot, nil, ChildContainersDir, &maxTraversals)
536-
if err != nil {
537-
return fmt.Errorf("couldn't find child container dir %q: %v", ChildContainersDir, err)
538-
}
539-
if err := contDir.CreateDirectory(ctx, globalRoot, c.cid, fs.FilePermsFromMode(0755)); err != nil {
540-
return fmt.Errorf("create directory %q: %v", c.cid, err)
541-
}
542-
containerRoot, err := contDir.Walk(ctx, globalRoot, c.cid)
543-
if err != nil {
544-
return fmt.Errorf("walk to %q failed: %v", c.cid, err)
545-
}
546-
defer containerRoot.DecRef()
547531

548-
// Create the container's root filesystem mount.
532+
// Create a new root inode and mount namespace for the container.
549533
rootInode, err := c.createRootMount(rootCtx, conf)
550534
if err != nil {
551535
return fmt.Errorf("creating filesystem for container: %v", err)
552536
}
553-
554-
// Mount the container's root filesystem to the newly created mount point.
555-
if err := mns.Mount(ctx, containerRoot, rootInode); err != nil {
556-
return fmt.Errorf("mount container root: %v", err)
557-
}
558-
559-
// We have to re-walk to the dirent to find the mounted directory. The old
560-
// dirent is invalid at this point.
561-
containerRoot, err = contDir.Walk(ctx, globalRoot, c.cid)
537+
mns, err := fs.NewMountNamespace(rootCtx, rootInode)
562538
if err != nil {
563-
return fmt.Errorf("find container mount point %q: %v", c.cid, err)
539+
return fmt.Errorf("creating new mount namespace for container: %v", err)
564540
}
565-
cu := specutils.MakeCleanup(func() { containerRoot.DecRef() })
566-
defer cu.Clean()
567-
568-
log.Infof("Mounted child's root fs to %q", filepath.Join(ChildContainersDir, c.cid))
569541

570542
// Set process root here, so 'rootCtx.Value(CtxRoot)' will return it.
571-
procArgs.Root = containerRoot
543+
// This will also donate a reference to procArgs, as required.
544+
procArgs.MountNamespace = mns
545+
procArgs.Root = mns.Root()
572546

573547
// Mount all submounts.
574-
if err := c.mountSubmounts(rootCtx, conf, mns, containerRoot); err != nil {
548+
if err := c.mountSubmounts(rootCtx, conf, mns, procArgs.Root); err != nil {
575549
return err
576550
}
577-
cu.Release()
578551
return c.checkDispenser()
579552
}
580553

@@ -585,75 +558,11 @@ func (c *containerMounter) checkDispenser() error {
585558
return nil
586559
}
587560

588-
// destroyContainerFS cleans up the filesystem by unmounting all mounts for the
589-
// given container and deleting the container root directory.
590-
func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error {
591-
defer func() {
592-
// Flushing dirent references triggers many async close
593-
// operations. We must wait for those to complete before
594-
// returning, otherwise the caller may kill the gofer before
595-
// they complete, causing a cascade of failing RPCs.
596-
//
597-
// This must take place in the first deferred function, so that
598-
// it runs after all the other deferred DecRef() calls in this
599-
// function.
600-
log.Infof("Waiting for async filesystem operations to complete")
601-
fs.AsyncBarrier()
602-
}()
603-
604-
// First get a reference to the container root directory.
605-
mns := k.RootMountNamespace()
606-
mnsRoot := mns.Root()
607-
defer mnsRoot.DecRef()
608-
containerRoot := path.Join(ChildContainersDir, cid)
609-
maxTraversals := uint(0)
610-
containerRootDirent, err := mns.FindInode(ctx, mnsRoot, nil, containerRoot, &maxTraversals)
611-
if err == syserror.ENOENT {
612-
// Container must have been destroyed already. That's fine.
613-
return nil
614-
}
615-
if err != nil {
616-
return fmt.Errorf("finding container root directory %q: %v", containerRoot, err)
617-
}
618-
defer containerRootDirent.DecRef()
619-
620-
// Iterate through all submounts and unmount them. We unmount lazily by
621-
// setting detach=true, so we can unmount in any order.
622-
mnt := mns.FindMount(containerRootDirent)
623-
for _, m := range mns.AllMountsUnder(mnt) {
624-
root := m.Root()
625-
defer root.DecRef()
626-
627-
// Do a best-effort unmount by flushing the refs and unmount
628-
// with "detach only = true". Unmount returns EINVAL when the mount point
629-
// doesn't exist, i.e. it has already been unmounted.
630-
log.Debugf("Unmounting container mount %q", root.BaseName())
631-
root.Inode.MountSource.FlushDirentRefs()
632-
if err := mns.Unmount(ctx, root, true /* detach only */); err != nil && err != syserror.EINVAL {
633-
return fmt.Errorf("unmounting container mount %q: %v", root.BaseName(), err)
634-
}
635-
}
636-
637-
// Get a reference to the parent directory and remove the root
638-
// container directory.
639-
maxTraversals = 0
640-
containersDirDirent, err := mns.FindInode(ctx, mnsRoot, nil, ChildContainersDir, &maxTraversals)
641-
if err != nil {
642-
return fmt.Errorf("finding containers directory %q: %v", ChildContainersDir, err)
643-
}
644-
defer containersDirDirent.DecRef()
645-
log.Debugf("Deleting container root %q", containerRoot)
646-
if err := containersDirDirent.RemoveDirectory(ctx, mnsRoot, cid); err != nil {
647-
return fmt.Errorf("removing directory %q: %v", containerRoot, err)
648-
}
649-
650-
return nil
651-
}
652-
653561
// setupRootContainer creates a mount namespace containing the root filesystem
654562
// and all mounts. 'rootCtx' is used to walk directories to find mount points.
655-
// 'setMountNS' is called after namespace is created. It must set the mount NS
656-
// to 'rootCtx'.
563+
// The 'setMountNS' callback is called after the mount namespace is created and
564+
// will get a reference on that namespace. The callback must ensure that the
565+
// rootCtx has the provided mount namespace.
657566
func (c *containerMounter) setupRootContainer(userCtx context.Context, rootCtx context.Context, conf *Config, setMountNS func(*fs.MountNamespace)) error {
658567
for _, hint := range c.hints.mounts {
659568
log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
@@ -664,13 +573,6 @@ func (c *containerMounter) setupRootContainer(userCtx context.Context, rootCtx c
664573
hint.root = inode
665574
}
666575

667-
// Create a tmpfs mount where we create and mount a root filesystem for
668-
// each child container.
669-
c.mounts = append(c.mounts, specs.Mount{
670-
Type: tmpfs,
671-
Destination: ChildContainersDir,
672-
})
673-
674576
rootInode, err := c.createRootMount(rootCtx, conf)
675577
if err != nil {
676578
return fmt.Errorf("creating root mount: %v", err)

runsc/boot/loader.go

Lines changed: 31 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ import (
3535
"gvisor.dev/gvisor/pkg/refs"
3636
"gvisor.dev/gvisor/pkg/sentry/arch"
3737
"gvisor.dev/gvisor/pkg/sentry/control"
38+
"gvisor.dev/gvisor/pkg/sentry/fs"
3839
"gvisor.dev/gvisor/pkg/sentry/fs/host"
3940
"gvisor.dev/gvisor/pkg/sentry/inet"
4041
"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -525,8 +526,7 @@ func (l *Loader) run() error {
525526
}
526527

527528
rootCtx := l.rootProcArgs.NewContext(l.k)
528-
rootMns := l.k.RootMountNamespace()
529-
if err := setExecutablePath(rootCtx, rootMns, &l.rootProcArgs); err != nil {
529+
if err := setExecutablePath(rootCtx, &l.rootProcArgs); err != nil {
530530
return err
531531
}
532532

@@ -540,7 +540,7 @@ func (l *Loader) run() error {
540540
}
541541
}
542542
if !hasHomeEnvv {
543-
homeDir, err := getExecUserHome(rootCtx, rootMns, uint32(l.rootProcArgs.Credentials.RealKUID))
543+
homeDir, err := getExecUserHome(rootCtx, l.rootProcArgs.MountNamespace, uint32(l.rootProcArgs.Credentials.RealKUID))
544544
if err != nil {
545545
return fmt.Errorf("error reading exec user: %v", err)
546546
}
@@ -663,8 +663,7 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
663663
return fmt.Errorf("configuring container FS: %v", err)
664664
}
665665

666-
mns := l.k.RootMountNamespace()
667-
if err := setExecutablePath(ctx, mns, &procArgs); err != nil {
666+
if err := setExecutablePath(ctx, &procArgs); err != nil {
668667
return fmt.Errorf("setting executable path for %+v: %v", procArgs, err)
669668
}
670669

@@ -689,8 +688,10 @@ func (l *Loader) destroyContainer(cid string) error {
689688
defer l.mu.Unlock()
690689

691690
// Has the container started?
692-
if _, _, err := l.threadGroupFromIDLocked(execID{cid: cid}); err == nil {
693-
// If the container has started, kill and wait for all processes.
691+
_, _, err := l.threadGroupFromIDLocked(execID{cid: cid})
692+
693+
// If the container has started, kill and wait for all processes.
694+
if err == nil {
694695
if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
695696
return fmt.Errorf("sending SIGKILL to all container processes: %v", err)
696697
}
@@ -703,12 +704,17 @@ func (l *Loader) destroyContainer(cid string) error {
703704
}
704705
}
705706

706-
ctx := l.rootProcArgs.NewContext(l.k)
707-
if err := destroyContainerFS(ctx, cid, l.k); err != nil {
708-
return fmt.Errorf("destroying filesystem for container %q: %v", cid, err)
709-
}
707+
// At this point, all processes inside of the container have exited,
708+
// releasing all references to the container's MountNamespace and
709+
// causing all submounts and overlays to be unmounted.
710+
//
711+
// Since the container's MountNamespace has been released,
712+
// MountNamespace.destroy() will have executed, but that function may
713+
// trigger async close operations. We must wait for those to complete
714+
// before returning, otherwise the caller may kill the gofer before
715+
// they complete, causing a cascade of failing RPCs.
716+
fs.AsyncBarrier()
710717

711-
// We made it!
712718
log.Debugf("Container destroyed %q", cid)
713719
return nil
714720
}
@@ -724,14 +730,22 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
724730
return 0, fmt.Errorf("no such container: %q", args.ContainerID)
725731
}
726732

727-
// Get the container Root Dirent from the Task, since we must run this
728-
// process with the same Root.
733+
// Get the container Root Dirent and MountNamespace from the Task.
729734
tg.Leader().WithMuLocked(func(t *kernel.Task) {
735+
// FSContext.RootDirectory() will take an extra ref for us.
730736
args.Root = t.FSContext().RootDirectory()
737+
738+
// task.MountNamespace() does not take a ref, so we must do so
739+
// ourselves.
740+
args.MountNamespace = t.MountNamespace()
741+
args.MountNamespace.IncRef()
731742
})
732-
if args.Root != nil {
733-
defer args.Root.DecRef()
734-
}
743+
defer func() {
744+
if args.Root != nil {
745+
args.Root.DecRef()
746+
}
747+
args.MountNamespace.DecRef()
748+
}()
735749

736750
// Start the process.
737751
proc := control.Proc{Kernel: l.k}

0 commit comments

Comments
 (0)