462 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			Go
		
	
	
	
			
		
		
	
	
			462 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			Go
		
	
	
	
| // Package cap provides all the Linux Capabilities userspace library API
 | |
| // bindings in native Go.
 | |
| //
 | |
| // Capabilities are a feature of the Linux kernel that allow fine
 | |
| // grain permissions to perform privileged operations. Privileged
 | |
| // operations are required to do irregular system level operations
 | |
| // from code. You can read more about how Capabilities are intended to
 | |
| // work here:
 | |
| //
 | |
| //   https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/33528.pdf
 | |
| //
 | |
| // This package supports native Go bindings for all the features
 | |
| // described in that paper as well as supporting subsequent changes to
 | |
| // the kernel for other styles of inheritable Capability.
 | |
| //
 | |
| // Some simple things you can do with this package are:
 | |
| //
 | |
| //   // Read and display the capabilities of the running process
 | |
| //   c := cap.GetProc()
 | |
| //   log.Printf("this process has these caps: %q", c)
 | |
| //
 | |
| //   // Drop any privilege a process might have (including for root,
 | |
| //   // but note root 'owns' a lot of system files so a cap-limited
 | |
| //   // root can still do considerable damage to a running system).
 | |
| //   old := cap.GetProc()
 | |
| //   empty := cap.NewSet()
 | |
| //   if err := empty.SetProc(); err != nil {
 | |
| //       log.Fatalf("failed to drop privilege: %q -> %q: %v", old, empty, err)
 | |
| //   }
 | |
| //   now := cap.GetProc()
 | |
| //   if cap.Differs(now.Compare(empty)) {
 | |
| //       log.Fatalf("failed to fully drop privilege: have=%q, wanted=%q", now, empty)
 | |
| //   }
 | |
| //
 | |
| // See https://sites.google.com/site/fullycapable/ for recent updates,
 | |
| // some more complete walk-through examples of ways of using
 | |
| // 'cap.Set's etc and information on how to file bugs.
 | |
| //
 | |
| // For CGo linked binaries, behind the scenes, the package
 | |
| // "kernel.org/pub/linux/libs/security/libcap/psx" is used to perform
 | |
| // POSIX semantics system calls that manipulate thread state
 | |
| // uniformly over the whole Go (and CGo linked) process runtime.
 | |
| //
 | |
| // Note, if the Go runtime syscall interface contains the Linux
 | |
| // variant syscall.AllThreadsSyscall() API (it debuted in go1.16 see
 | |
| // https://github.com/golang/go/issues/1435 for its history) then
 | |
| // the "psx" package will use that to invoke Capability setting system
 | |
| // calls in pure Go binaries. In such an enhanced Go runtime, to force
 | |
| // this behavior, use the CGO_ENABLED=0 environment variable.
 | |
| //
 | |
| //
 | |
| // Copyright (c) 2019-21 Andrew G. Morgan <morgan@kernel.org>
 | |
| //
 | |
| // The cap and psx packages are licensed with a (you choose) BSD
 | |
| // 3-clause or GPL2. See LICENSE file for details.
 | |
| package cap // import "kernel.org/pub/linux/libs/security/libcap/cap"
 | |
| 
 | |
| import (
 | |
| 	"errors"
 | |
| 	"sort"
 | |
| 	"sync"
 | |
| 	"syscall"
 | |
| 	"unsafe"
 | |
| )
 | |
| 
 | |
| // Value is the type of a single capability (or permission) bit. A
 | |
| // Value is what functions such as GetBound() and GetAmbient() accept
 | |
| // to identify the capability being queried.
 | |
| type Value uint
 | |
| 
 | |
| // Flag is the type of one of the three Value dimensions held in a
 | |
| // Set: Effective, Permitted or Inheritable. It is also used in the
 | |
| // (*IAB).Fill() method for changing the Bounding and Ambient Vectors.
 | |
| type Flag uint
 | |
| 
 | |
| // Effective, Permitted, Inheritable are the three Flags of Values
 | |
| // held in a Set. They are numbered consecutively from zero, and the
 | |
| // internal data array is sized as Inheritable+1 to hold one word for
 | |
| // each of them.
 | |
| const (
 | |
| 	Effective Flag = iota
 | |
| 	Permitted
 | |
| 	Inheritable
 | |
| )
 | |
| 
 | |
| // String identifies a Flag value by its conventional "e", "p" or "i"
 | |
| // string abbreviation.
 | |
| func (f Flag) String() string {
 | |
| 	switch f {
 | |
| 	case Effective:
 | |
| 		return "e"
 | |
| 	case Permitted:
 | |
| 		return "p"
 | |
| 	case Inheritable:
 | |
| 		return "i"
 | |
| 	default:
 | |
| 		return "<Error>"
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // data holds a 32-bit slice of the compressed bitmaps of capability
 | |
| // sets as understood by the kernel. Each data value holds one uint32
 | |
| // word per Flag (Effective, Permitted, Inheritable).
 | |
| type data [Inheritable + 1]uint32
 | |
| 
 | |
| // Set is an opaque capabilities container for a set of system
 | |
| // capabilities. It holds individually addressable capability Value's
 | |
| // for the three capability Flag's. See GetFlag() and SetFlag() for
 | |
| // how to adjust them individually, and Clear() and ClearFlag() for
 | |
| // how to do bulk operations.
 | |
| //
 | |
| // For admin tasks associated with managing namespace specific file
 | |
| // capabilities, Set can also support a namespace-root-UID value which
 | |
| // defaults to zero. See GetNSOwner() and SetNSOwner().
 | |
| type Set struct {
 | |
| 	// mu protects all other members of a Set.
 | |
| 	mu sync.RWMutex
 | |
| 
 | |
| 	// flat holds Flag Value bitmaps for all capabilities
 | |
| 	// associated with this Set. NewSet() allocates one data
 | |
| 	// entry per 32-bit word in use for this session.
 | |
| 	flat []data
 | |
| 
 | |
| 	// nsRoot is Linux specific. It is copied along with flat by
 | |
| 	// Dup(); presumably it is the namespace-root-UID value
 | |
| 	// described above - confirm against GetNSOwner().
 | |
| 	nsRoot int
 | |
| }
 | |
| 
 | |
| // Various known kernel magic values. One of these is stored in the
 | |
| // magic field of a header to select the capability ABI version used
 | |
| // for a system call.
 | |
| const (
 | |
| 	kv1 = 0x19980330 // First iteration of process capabilities (32 bits).
 | |
| 	kv2 = 0x20071026 // First iteration of process and file capabilities (64 bits) - deprecated.
 | |
| 	kv3 = 0x20080522 // Most recently supported process and file capabilities (64 bits).
 | |
| )
 | |
| 
 | |
| var (
 | |
| 	// startUp protects setting of the following values: magic,
 | |
| 	// words, maxValues. All three are assigned by cInit().
 | |
| 	startUp sync.Once
 | |
| 
 | |
| 	// magic holds the preferred magic number for the kernel ABI.
 | |
| 	magic uint32
 | |
| 
 | |
| 	// words holds the number of uint32's associated with each
 | |
| 	// capability Flag for this session.
 | |
| 	words int
 | |
| 
 | |
| 	// maxValues holds the number of bit values that are named by
 | |
| 	// the running kernel. This is generally expected to match
 | |
| 	// ValueCount which is autogenerated at packaging time.
 | |
| 	maxValues uint
 | |
| )
 | |
| 
 | |
| // header is the first argument passed (by pointer) to the capget and
 | |
| // capset system calls. magic selects the kernel ABI version (see
 | |
| // kv1, kv2 and kv3) and pid names the target process, where zero
 | |
| // means the current one.
 | |
| type header struct {
 | |
| 	magic uint32
 | |
| 	pid   int32
 | |
| }
 | |
| 
 | |
| // scwMu is used to fully serialize the write system calls. Note, this
 | |
| // is generally not necessary, but in the case of Launch we get into a
 | |
| // situation where the launching thread is temporarily allowed to
 | |
| // deviate from the kernel state of the rest of the runtime and
 | |
| // allowing other threads to perform w* syscalls will potentially
 | |
| // interfere with the launching process.
 | |
| var scwMu sync.Mutex
 | |
| 
 | |
| // syscaller is a type for abstracting syscalls. The r* variants are
 | |
| // for reading state, and can be parallelized, the w* variants need to
 | |
| // be serialized so all OS threads can share state. The 3 and 6
 | |
| // suffixes give the number of arguments that follow the trap number.
 | |
| type syscaller struct {
 | |
| 	r3 func(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno)
 | |
| 	w3 func(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno)
 | |
| 	r6 func(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err syscall.Errno)
 | |
| 	w6 func(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err syscall.Errno)
 | |
| }
 | |
| 
 | |
| // caprcall provides a pointer etc wrapper for the system calls
 | |
| // associated with getcap.
 | |
| //go:uintptrescapes
 | |
| func (sc *syscaller) caprcall(call uintptr, h *header, d []data) error {
 | |
| 	x := uintptr(0)
 | |
| 	if d != nil {
 | |
| 		x = uintptr(unsafe.Pointer(&d[0]))
 | |
| 	}
 | |
| 	_, _, err := sc.r3(call, uintptr(unsafe.Pointer(h)), x, 0)
 | |
| 	if err != 0 {
 | |
| 		return err
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // capwcall provides a pointer etc wrapper for the system calls
 | |
| // associated with setcap.
 | |
| //go:uintptrescapes
 | |
| func (sc *syscaller) capwcall(call uintptr, h *header, d []data) error {
 | |
| 	x := uintptr(0)
 | |
| 	if d != nil {
 | |
| 		x = uintptr(unsafe.Pointer(&d[0]))
 | |
| 	}
 | |
| 	_, _, err := sc.w3(call, uintptr(unsafe.Pointer(h)), x, 0)
 | |
| 	if err != 0 {
 | |
| 		return err
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // prctlrcall provides a wrapper for the prctl systemcalls that only
 | |
| // read kernel state. There is a limited number of arguments needed
 | |
| // and the caller should use 0 for those not needed.
 | |
| func (sc *syscaller) prctlrcall(prVal, v1, v2 uintptr) (int, error) {
 | |
| 	r, _, err := sc.r3(syscall.SYS_PRCTL, prVal, v1, v2)
 | |
| 	if err != 0 {
 | |
| 		return int(r), err
 | |
| 	}
 | |
| 	return int(r), nil
 | |
| }
 | |
| 
 | |
| // prctlrcall6 provides a wrapper for the prctl systemcalls that only
 | |
| // read kernel state and require 6 arguments - ambient cap API, I'm
 | |
| // looking at you. There is a limited number of arguments needed and
 | |
| // the caller should use 0 for those not needed.
 | |
| func (sc *syscaller) prctlrcall6(prVal, v1, v2, v3, v4, v5 uintptr) (int, error) {
 | |
| 	r, _, err := sc.r6(syscall.SYS_PRCTL, prVal, v1, v2, v3, v4, v5)
 | |
| 	if err != 0 {
 | |
| 		return int(r), err
 | |
| 	}
 | |
| 	return int(r), nil
 | |
| }
 | |
| 
 | |
| // prctlwcall provides a wrapper for the prctl systemcalls that
 | |
| // write/modify kernel state. Where available, these will use the
 | |
| // POSIX semantics fixup system calls. There is a limited number of
 | |
| // arguments needed and the caller should use 0 for those not needed.
 | |
| func (sc *syscaller) prctlwcall(prVal, v1, v2 uintptr) (int, error) {
 | |
| 	r, _, err := sc.w3(syscall.SYS_PRCTL, prVal, v1, v2)
 | |
| 	if err != 0 {
 | |
| 		return int(r), err
 | |
| 	}
 | |
| 	return int(r), nil
 | |
| }
 | |
| 
 | |
| // prctlwcall6 provides a wrapper for the prctl systemcalls that
 | |
| // write/modify kernel state and require 6 arguments - ambient cap
 | |
| // API, I'm looking at you. (Where available, these will use the POSIX
 | |
| // semantics fixup system calls). There is a limited number of
 | |
| // arguments needed and the caller should use 0 for those not needed.
 | |
| func (sc *syscaller) prctlwcall6(prVal, v1, v2, v3, v4, v5 uintptr) (int, error) {
 | |
| 	r, _, err := sc.w6(syscall.SYS_PRCTL, prVal, v1, v2, v3, v4, v5)
 | |
| 	if err != 0 {
 | |
| 		return int(r), err
 | |
| 	}
 | |
| 	return int(r), nil
 | |
| }
 | |
| 
 | |
| // cInit perfoms the lazy identification of the capability vintage of
 | |
| // the running system.
 | |
| func (sc *syscaller) cInit() {
 | |
| 	h := &header{
 | |
| 		magic: kv3,
 | |
| 	}
 | |
| 	sc.caprcall(syscall.SYS_CAPGET, h, nil)
 | |
| 	magic = h.magic
 | |
| 	switch magic {
 | |
| 	case kv1:
 | |
| 		words = 1
 | |
| 	case kv2, kv3:
 | |
| 		words = 2
 | |
| 	default:
 | |
| 		// Fall back to a known good version.
 | |
| 		magic = kv3
 | |
| 		words = 2
 | |
| 	}
 | |
| 	// Use the bounding set to evaluate which capabilities exist.
 | |
| 	maxValues = uint(sort.Search(32*words, func(n int) bool {
 | |
| 		_, err := GetBound(Value(n))
 | |
| 		return err != nil
 | |
| 	}))
 | |
| 	if maxValues == 0 {
 | |
| 		// Fall back to using the largest value defined at build time.
 | |
| 		maxValues = NamedCount
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // MaxBits returns the number of kernel-named capabilities discovered
 | |
| // at runtime in the current system.
 | |
| func MaxBits() Value {
 | |
| 	startUp.Do(multisc.cInit)
 | |
| 	return Value(maxValues)
 | |
| }
 | |
| 
 | |
| // NewSet returns an empty capability set.
 | |
| func NewSet() *Set {
 | |
| 	startUp.Do(multisc.cInit)
 | |
| 	return &Set{
 | |
| 		flat: make([]data, words),
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // ErrBadSet indicates a nil pointer was used for a *Set, or the
 | |
| // request of the Set is invalid in some way (for example, the Set
 | |
| // holds no capability data).
 | |
| var ErrBadSet = errors.New("bad capability set")
 | |
| 
 | |
| // Dup returns a copy of the specified capability set.
 | |
| func (c *Set) Dup() (*Set, error) {
 | |
| 	if c == nil || len(c.flat) == 0 {
 | |
| 		return nil, ErrBadSet
 | |
| 	}
 | |
| 	n := NewSet()
 | |
| 	c.mu.RLock()
 | |
| 	defer c.mu.RUnlock()
 | |
| 	copy(n.flat, c.flat)
 | |
| 	n.nsRoot = c.nsRoot
 | |
| 	return n, nil
 | |
| }
 | |
| 
 | |
| // GetPID returns the capability set associated with the target process
 | |
| // id; pid=0 is an alias for current.
 | |
| func GetPID(pid int) (*Set, error) {
 | |
| 	v := NewSet()
 | |
| 	if err := multisc.caprcall(syscall.SYS_CAPGET, &header{magic: magic, pid: int32(pid)}, v.flat); err != nil {
 | |
| 		return nil, err
 | |
| 	}
 | |
| 	return v, nil
 | |
| }
 | |
| 
 | |
| // GetProc returns the capability Set of the current process. If the
 | |
| // kernel is unable to determine the Set associated with the current
 | |
| // process, the function panic()s.
 | |
| func GetProc() *Set {
 | |
| 	c, err := GetPID(0)
 | |
| 	if err != nil {
 | |
| 		panic(err)
 | |
| 	}
 | |
| 	return c
 | |
| }
 | |
| 
 | |
| func (sc *syscaller) setProc(c *Set) error {
 | |
| 	if c == nil || len(c.flat) == 0 {
 | |
| 		return ErrBadSet
 | |
| 	}
 | |
| 	return sc.capwcall(syscall.SYS_CAPSET, &header{magic: magic}, c.flat)
 | |
| }
 | |
| 
 | |
| // SetProc attempts to set the capability Set of the current
 | |
| // process. The kernel will perform permission checks and an error
 | |
| // will be returned if the attempt fails. Should the attempt fail
 | |
| // no process capabilities will have been modified.
 | |
| func (c *Set) SetProc() error {
 | |
| 	scwMu.Lock()
 | |
| 	defer scwMu.Unlock()
 | |
| 	return multisc.setProc(c)
 | |
| }
 | |
| 
 | |
| // defines from uapi/linux/prctl.h
 | |
| const (
 | |
| 	prCapBSetRead = 23 // prctl option used by GetBound() to read a bounding set bit.
 | |
| 	prCapBSetDrop = 24 // prctl option used by dropBound() to drop a bounding set bit.
 | |
| )
 | |
| 
 | |
| // GetBound determines if a specific capability is currently part of
 | |
| // the local bounding set. On systems where the bounding set Value is
 | |
| // not present, this function returns an error.
 | |
| func GetBound(val Value) (bool, error) {
 | |
| 	v, err := multisc.prctlrcall(prCapBSetRead, uintptr(val), 0)
 | |
| 	if err != nil {
 | |
| 		return false, err
 | |
| 	}
 | |
| 	return v > 0, nil
 | |
| }
 | |
| 
 | |
| //go:uintptrescapes
 | |
| func (sc *syscaller) dropBound(val ...Value) error {
 | |
| 	for _, v := range val {
 | |
| 		if _, err := sc.prctlwcall(prCapBSetDrop, uintptr(v), 0); err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // DropBound attempts to suppress bounding set Values. The kernel will
 | |
| // never allow a bounding set Value bit to be raised once successfully
 | |
| // dropped. However, dropping requires the current process is
 | |
| // sufficiently capable (usually via cap.SETPCAP being raised in the
 | |
| // Effective flag of the process' Set). Note, the drops are performed
 | |
| // in order and if one bounding value cannot be dropped, the function
 | |
| // returns immediately with an error which may leave the system in an
 | |
| // ill-defined state. The caller can determine where things went wrong
 | |
| // using GetBound().
 | |
| func DropBound(val ...Value) error {
 | |
| 	scwMu.Lock()
 | |
| 	defer scwMu.Unlock()
 | |
| 	return multisc.dropBound(val...)
 | |
| }
 | |
| 
 | |
| // defines from uapi/linux/prctl.h
 | |
| const (
 | |
| 	// prCapAmbient is the prctl option used for all ambient set
 | |
| 	// operations; the values below are passed as its first
 | |
| 	// argument to select the operation.
 | |
| 	prCapAmbient = 47
 | |
| 
 | |
| 	prCapAmbientIsSet    = 1 // query a bit, see GetAmbient().
 | |
| 	prCapAmbientRaise    = 2 // raise a bit, see setAmbient().
 | |
| 	prCapAmbientLower    = 3 // lower a bit, see setAmbient().
 | |
| 	prCapAmbientClearAll = 4 // clear the whole set, see resetAmbient().
 | |
| )
 | |
| 
 | |
| // GetAmbient determines if a specific capability is currently part of
 | |
| // the local ambient set. On systems where the ambient set Value is
 | |
| // not present, this function returns an error.
 | |
| func GetAmbient(val Value) (bool, error) {
 | |
| 	r, err := multisc.prctlrcall6(prCapAmbient, prCapAmbientIsSet, uintptr(val), 0, 0, 0)
 | |
| 	return r > 0, err
 | |
| }
 | |
| 
 | |
| //go:uintptrescapes
 | |
| func (sc *syscaller) setAmbient(enable bool, val ...Value) error {
 | |
| 	dir := uintptr(prCapAmbientLower)
 | |
| 	if enable {
 | |
| 		dir = prCapAmbientRaise
 | |
| 	}
 | |
| 	for _, v := range val {
 | |
| 		_, err := sc.prctlwcall6(prCapAmbient, dir, uintptr(v), 0, 0, 0)
 | |
| 		if err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // SetAmbient attempts to set a specific Value bit to the state,
 | |
| // enable. This function will return an error if insufficient
 | |
| // permission is available to perform this task. The settings are
 | |
| // performed in order and the function returns immediately an error is
 | |
| // detected. Use GetAmbient() to unravel where things went
 | |
| // wrong. Note, the cap package manages an abstraction IAB that
 | |
| // captures all three inheritable vectors in a single type. Consider
 | |
| // using that.
 | |
| func SetAmbient(enable bool, val ...Value) error {
 | |
| 	scwMu.Lock()
 | |
| 	defer scwMu.Unlock()
 | |
| 	return multisc.setAmbient(enable, val...)
 | |
| }
 | |
| 
 | |
| func (sc *syscaller) resetAmbient() error {
 | |
| 	var v bool
 | |
| 	var err error
 | |
| 
 | |
| 	for c := Value(0); !v; c++ {
 | |
| 		if v, err = GetAmbient(c); err != nil {
 | |
| 			// no non-zero values found.
 | |
| 			return nil
 | |
| 		}
 | |
| 	}
 | |
| 	_, err = sc.prctlwcall6(prCapAmbient, prCapAmbientClearAll, 0, 0, 0, 0)
 | |
| 	return err
 | |
| }
 | |
| 
 | |
| // ResetAmbient attempts to ensure the Ambient set is fully
 | |
| // cleared. It works by first reading the set and if it finds any bits
 | |
| // raised it will attempt a reset. The test before attempting a reset
 | |
| // behavior is a workaround for situations where the Ambient API is
 | |
| // locked, but a reset is not actually needed. No Ambient bit not
 | |
| // already raised in both the Permitted and Inheritable Set is allowed
 | |
| // to be raised by the kernel.
 | |
| func ResetAmbient() error {
 | |
| 	scwMu.Lock()
 | |
| 	defer scwMu.Unlock()
 | |
| 	return multisc.resetAmbient()
 | |
| }
 |