545 lines
14 KiB
C
545 lines
14 KiB
C
/* Copyright 2017 The Chromium OS Authors. All rights reserved.
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
|
|
|
|
#include "system.h"
|
|
|
|
#include <errno.h>
|
|
#include <fcntl.h>
|
|
#include <grp.h>
|
|
#include <net/if.h>
|
|
#include <pwd.h>
|
|
#include <stdbool.h>
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include <sys/ioctl.h>
|
|
#include <sys/prctl.h>
|
|
#include <sys/socket.h>
|
|
#include <sys/stat.h>
|
|
#include <sys/statvfs.h>
|
|
#include <unistd.h>
|
|
|
|
#include <linux/securebits.h>
|
|
|
|
#include "syscall_wrapper.h"
|
|
#include "util.h"
|
|
|
|
/*
 * The SECBIT_NO_CAP_AMBIENT_RAISE securebit first appeared in kernel 4.3.
 * When the securebits header we build against does not provide it (or its
 * lock bit), supply the definitions ourselves.
 */
#if !defined(SECBIT_NO_CAP_AMBIENT_RAISE)
#define SECBIT_NO_CAP_AMBIENT_RAISE (issecure_mask(6))
#endif

#if !defined(SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED)
#define SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED (issecure_mask(7))
#endif
|
|
|
|
/*
 * Compile-time check of the value of SECURE_ALL_BITS.
 *
 * Android devices are currently built against 4.4 kernel headers, and
 * kernel 4.3 added a new securebit.  Whenever a securebit is added, the
 * resulting SECURE_ALL_BITS mask is rejected with EPERM by older kernels.
 * Asserting the expected mask here turns that situation into a build
 * failure instead of a runtime one.
 */
#ifdef __ANDROID__
_Static_assert(SECURE_ALL_BITS == 0x55, "SECURE_ALL_BITS == 0x55.");
#endif
|
|
|
|
/*
 * Largest scratch buffer (1 MiB) that lookup_user() and lookup_group() are
 * willing to try before giving up.
 */
#define MAX_PWENT_SZ (1024 * 1024)
#define MAX_GRENT_SZ (1024 * 1024)
|
|
|
|
/*
 * secure_noroot_set_and_locked: Checks a securebits mask for NOROOT.
 *
 * Returns 1 when both SECBIT_NOROOT and SECBIT_NOROOT_LOCKED are present in
 * |mask|, and 0 otherwise.
 */
int secure_noroot_set_and_locked(uint64_t mask)
{
	const uint64_t noroot_bits = SECBIT_NOROOT | SECBIT_NOROOT_LOCKED;

	return (mask & noroot_bits) == noroot_bits;
}
|
|
|
|
/*
 * lock_securebits: Sets and locks the process securebits.
 *
 * |skip_mask| lists bits that must be left untouched.  When
 * |require_keep_caps| is false, SECBIT_KEEP_CAPS is allowed to stay off if
 * it is already locked off.
 *
 * Returns 0 on success (including when there is nothing to set) and -1 if
 * either prctl(2) call fails.
 */
int lock_securebits(uint64_t skip_mask, bool require_keep_caps)
{
	/* Start from "everything set and locked", then carve out exceptions. */
	unsigned long bits = SECURE_ALL_BITS | SECURE_ALL_LOCKS;

	/*
	 * SECBIT_KEEP_CAPS is cleared automatically on execve(2), so a
	 * process that already has it locked (e.g. nested minijail usage)
	 * cannot set it again.  Unless the caller insists on it, tolerate the
	 * bit staying off in that case.
	 */
	if (!require_keep_caps) {
		const int cur = prctl(PR_GET_SECUREBITS);
		if (cur < 0) {
			pwarn("prctl(PR_GET_SECUREBITS) failed");
			return -1;
		}

		const bool keep_caps_locked =
		    (cur & SECBIT_KEEP_CAPS_LOCKED) != 0;
		const bool keep_caps_set = (cur & SECBIT_KEEP_CAPS) != 0;
		if (keep_caps_locked && !keep_caps_set)
			bits &= ~SECBIT_KEEP_CAPS;
	}

	/*
	 * Raising an ambient capability requires it to be in both the
	 * permitted and the inheritable sets, and those sets are configured
	 * separately, so there is no need to lock NO_CAP_AMBIENT_RAISE.
	 */
	bits &=
	    ~(SECBIT_NO_CAP_AMBIENT_RAISE | SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED);

	/* Honour the caller's request to leave certain bits alone. */
	bits &= ~skip_mask;

	if (bits == 0) {
		warn("not locking any securebits");
		return 0;
	}

	if (prctl(PR_SET_SECUREBITS, bits) < 0) {
		pwarn("prctl(PR_SET_SECUREBITS) failed");
		return -1;
	}

	return 0;
}
|
|
|
|
int write_proc_file(pid_t pid, const char *content, const char *basename)
|
|
{
|
|
attribute_cleanup_fd int fd = -1;
|
|
int ret;
|
|
size_t sz, len;
|
|
ssize_t written;
|
|
char filename[32];
|
|
|
|
sz = sizeof(filename);
|
|
ret = snprintf(filename, sz, "/proc/%d/%s", pid, basename);
|
|
if (ret < 0 || (size_t)ret >= sz) {
|
|
warn("failed to generate %s filename", basename);
|
|
return -1;
|
|
}
|
|
|
|
fd = open(filename, O_WRONLY | O_CLOEXEC);
|
|
if (fd < 0) {
|
|
pwarn("failed to open '%s'", filename);
|
|
return -errno;
|
|
}
|
|
|
|
len = strlen(content);
|
|
written = write(fd, content, len);
|
|
if (written < 0) {
|
|
pwarn("failed to write '%s'", filename);
|
|
return -errno;
|
|
}
|
|
|
|
if ((size_t)written < len) {
|
|
warn("failed to write %zu bytes to '%s'", len, filename);
|
|
return -1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/*
 * get_last_valid_cap: Returns the highest capability number known to the
 * running kernel.
 *
 * cap_valid() is deliberately avoided: it only reflects the kernel headers
 * we were *compiled* against, and the kernel we actually run on may know
 * about fewer (older kernel) or more (newer kernel) capabilities.
 *
 * The value is normally read from /proc.  On Android not every process is
 * guaranteed access to '/proc/sys/kernel/cap_last_cap', so there the value
 * is discovered by probing with prctl(PR_CAPBSET_READ) instead.
 */
unsigned int get_last_valid_cap(void)
{
	unsigned int cap = 0;

	if (!is_android()) {
		static const char cap_file[] = "/proc/sys/kernel/cap_last_cap";
		FILE *stream = fopen(cap_file, "re");

		if (stream == NULL)
			pdie("fopen(%s)", cap_file);
		if (fscanf(stream, "%u", &cap) != 1)
			pdie("fscanf(%s)", cap_file);
		fclose(stream);
		return cap;
	}

	/* Probe upwards until the kernel rejects a capability number. */
	while (prctl(PR_CAPBSET_READ, cap, 0, 0, 0) >= 0)
		++cap;

	/* |cap| is now the first failing value; step back to the last good. */
	return cap > 0 ? cap - 1 : cap;
}
|
|
|
|
/*
 * cap_ambient_supported: Probes for ambient capability support.
 *
 * Queries whether CAP_CHOWN is in the ambient set; only the success of the
 * prctl(2) call matters, not its answer.  Returns 1 if the call succeeds
 * and 0 if it fails.
 */
int cap_ambient_supported(void)
{
	const int probe =
	    prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_CHOWN, 0, 0);

	return probe >= 0;
}
|
|
|
|
int config_net_loopback(void)
|
|
{
|
|
const char ifname[] = "lo";
|
|
attribute_cleanup_fd int sock = -1;
|
|
struct ifreq ifr;
|
|
|
|
/* Make sure people don't try to add really long names. */
|
|
_Static_assert(sizeof(ifname) <= IFNAMSIZ, "interface name too long");
|
|
|
|
sock = socket(AF_LOCAL, SOCK_DGRAM | SOCK_CLOEXEC, 0);
|
|
if (sock < 0) {
|
|
pwarn("socket(AF_LOCAL) failed");
|
|
return -1;
|
|
}
|
|
|
|
/*
|
|
* Do the equiv of `ip link set up lo`. The kernel will assign
|
|
* IPv4 (127.0.0.1) & IPv6 (::1) addresses automatically!
|
|
*/
|
|
strcpy(ifr.ifr_name, ifname);
|
|
if (ioctl(sock, SIOCGIFFLAGS, &ifr) < 0) {
|
|
pwarn("ioctl(SIOCGIFFLAGS) failed");
|
|
return -1;
|
|
}
|
|
|
|
/* The kernel preserves ifr.ifr_name for use. */
|
|
ifr.ifr_flags |= IFF_UP | IFF_RUNNING;
|
|
if (ioctl(sock, SIOCSIFFLAGS, &ifr) < 0) {
|
|
pwarn("ioctl(SIOCSIFFLAGS) failed");
|
|
return -1;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
 * write_pid_to_path: Writes |pid| in decimal, followed by a newline, to the
 * file at |path| (opened with mode "we").
 *
 * Returns 0 on success, -errno if fopen(3) or fclose(3) fails, and -1 if
 * fprintf(3) fails.
 */
int write_pid_to_path(pid_t pid, const char *path)
{
	int rc;
	FILE *fp = fopen(path, "we");

	if (!fp) {
		/* Save errno first: the logging call may overwrite it. */
		rc = errno;
		pwarn("failed to open '%s'", path);
		return -rc;
	}
	if (fprintf(fp, "%d\n", (int)pid) < 0) {
		/* fprintf(3) does not set errno on failure. */
		warn("fprintf(%s) failed", path);
		fclose(fp);
		return -1;
	}
	/* fclose(3) flushes, so a failure here means the write was lost. */
	if (fclose(fp)) {
		rc = errno;
		pwarn("fclose(%s) failed", path);
		return -rc;
	}

	return 0;
}
|
|
|
|
/*
 * Create the |path| directory and its parents (if need be) with |mode|.
 * If not |isdir|, then |path| is actually a file, so the last component
 * will not be created.
 *
 * Components that already exist (EEXIST) are not treated as errors.
 * Returns 0 on success or a negative errno value on failure.
 */
int mkdir_p(const char *path, mode_t mode, bool isdir)
{
	int rc;
	char *dir = strdup(path);
	if (!dir) {
		rc = errno;
		pwarn("strdup(%s) failed", path);
		return -rc;
	}

	/*
	 * Starting from the root, work our way out to the end.  The scan
	 * begins at |dir + 1| so that a leading '/' is not treated as a
	 * separator; that offset is only inside the string when |path| is
	 * non-empty, so skip the scan entirely for "".
	 */
	char *p = dir[0] != '\0' ? strchr(dir + 1, '/') : NULL;
	while (p) {
		/* Temporarily terminate the string at this separator. */
		*p = '\0';
		if (mkdir(dir, mode) && errno != EEXIST) {
			rc = errno;
			pwarn("mkdir(%s, 0%o) failed", dir, mode);
			free(dir);
			return -rc;
		}
		*p = '/';
		p = strchr(p + 1, '/');
	}

	/*
	 * Create the last directory.  We still check EEXIST here in case
	 * of trailing slashes.
	 */
	free(dir);
	if (isdir && mkdir(path, mode) && errno != EEXIST) {
		rc = errno;
		pwarn("mkdir(%s, 0%o) failed", path, mode);
		return -rc;
	}
	return 0;
}
|
|
|
|
/*
|
|
* setup_mount_destination: Ensures the mount target exists.
|
|
* Creates it if needed and possible.
|
|
*/
|
|
int setup_mount_destination(const char *source, const char *dest, uid_t uid,
|
|
uid_t gid, bool bind, unsigned long *mnt_flags)
|
|
{
|
|
int rc;
|
|
struct stat st_buf;
|
|
bool domkdir;
|
|
|
|
rc = stat(dest, &st_buf);
|
|
if (rc == 0) /* destination exists */
|
|
return 0;
|
|
|
|
/*
|
|
* Try to create the destination.
|
|
* Either make a directory or touch a file depending on the source type.
|
|
*
|
|
* If the source isn't an absolute path, assume it is a filesystem type
|
|
* such as "tmpfs" and create a directory to mount it on. The dest will
|
|
* be something like "none" or "proc" which we shouldn't be checking.
|
|
*/
|
|
if (source[0] == '/') {
|
|
/* The source is an absolute path -- it better exist! */
|
|
rc = stat(source, &st_buf);
|
|
if (rc) {
|
|
rc = errno;
|
|
pwarn("stat(%s) failed", source);
|
|
return -rc;
|
|
}
|
|
|
|
/*
|
|
* If bind mounting, we only create a directory if the source
|
|
* is a directory, else we always bind mount it as a file to
|
|
* support device nodes, sockets, etc...
|
|
*
|
|
* For all other mounts, we assume a block/char source is
|
|
* going to want a directory to mount to. If the source is
|
|
* something else (e.g. a fifo or socket), this probably will
|
|
* not do the right thing, but we'll fail later on when we try
|
|
* to mount(), so shouldn't be a big deal.
|
|
*/
|
|
domkdir = S_ISDIR(st_buf.st_mode) ||
|
|
(!bind && (S_ISBLK(st_buf.st_mode) ||
|
|
S_ISCHR(st_buf.st_mode)));
|
|
|
|
/* If bind mounting, also grab the mount flags of the source. */
|
|
if (bind && mnt_flags) {
|
|
struct statvfs stvfs_buf;
|
|
rc = statvfs(source, &stvfs_buf);
|
|
if (rc) {
|
|
rc = errno;
|
|
pwarn(
|
|
"failed to look up mount flags: source=%s",
|
|
source);
|
|
return -rc;
|
|
}
|
|
*mnt_flags = stvfs_buf.f_flag;
|
|
}
|
|
} else {
|
|
/* The source is a relative path -- assume it's a pseudo fs. */
|
|
|
|
/* Disallow relative bind mounts. */
|
|
if (bind) {
|
|
warn("relative bind-mounts are not allowed: source=%s",
|
|
source);
|
|
return -EINVAL;
|
|
}
|
|
|
|
domkdir = true;
|
|
}
|
|
|
|
/*
|
|
* Now that we know what we want to do, do it!
|
|
* We always create the intermediate dirs and the final path with 0755
|
|
* perms and root/root ownership. This shouldn't be a problem because
|
|
* the actual mount will set those perms/ownership on the mount point
|
|
* which is all people should need to access it.
|
|
*/
|
|
rc = mkdir_p(dest, 0755, domkdir);
|
|
if (rc)
|
|
return rc;
|
|
if (!domkdir) {
|
|
attribute_cleanup_fd int fd = open(
|
|
dest, O_RDWR | O_CREAT | O_CLOEXEC, 0700);
|
|
if (fd < 0) {
|
|
rc = errno;
|
|
pwarn("open(%s) failed", dest);
|
|
return -rc;
|
|
}
|
|
}
|
|
if (chown(dest, uid, gid)) {
|
|
rc = errno;
|
|
pwarn("chown(%s, %u, %u) failed", dest, uid, gid);
|
|
return -rc;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* lookup_user: Gets the uid/gid for the given username.
|
|
*/
|
|
int lookup_user(const char *user, uid_t *uid, gid_t *gid)
|
|
{
|
|
char *buf = NULL;
|
|
struct passwd pw;
|
|
struct passwd *ppw = NULL;
|
|
/*
|
|
* sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return
|
|
* a suggested starting size for the buffer, so let's try getting this
|
|
* size first, and fallback to a default othersise.
|
|
*/
|
|
ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX);
|
|
if (sz == -1)
|
|
sz = 65536; /* your guess is as good as mine... */
|
|
|
|
do {
|
|
buf = malloc(sz);
|
|
if (!buf)
|
|
return -ENOMEM;
|
|
int err = getpwnam_r(user, &pw, buf, sz, &ppw);
|
|
/*
|
|
* We're safe to free the buffer here. The strings inside |pw|
|
|
* point inside |buf|, but we don't use any of them; this leaves
|
|
* the pointers dangling but it's safe.
|
|
* |ppw| points at |pw| if getpwnam_r(3) succeeded.
|
|
*/
|
|
free(buf);
|
|
if (err == ERANGE) {
|
|
/* |buf| was too small, retry with a bigger one. */
|
|
sz <<= 1;
|
|
} else if (err != 0) {
|
|
/* We got an error not related to the size of |buf|. */
|
|
return -err;
|
|
} else if (!ppw) {
|
|
/* Not found. */
|
|
return -ENOENT;
|
|
} else {
|
|
*uid = ppw->pw_uid;
|
|
*gid = ppw->pw_gid;
|
|
return 0;
|
|
}
|
|
} while (sz <= MAX_PWENT_SZ);
|
|
|
|
/* A buffer of size MAX_PWENT_SZ is still too small, return an error. */
|
|
return -ERANGE;
|
|
}
|
|
|
|
/*
|
|
* lookup_group: Gets the gid for the given group name.
|
|
*/
|
|
int lookup_group(const char *group, gid_t *gid)
|
|
{
|
|
char *buf = NULL;
|
|
struct group gr;
|
|
struct group *pgr = NULL;
|
|
/*
|
|
* sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return
|
|
* a suggested starting size for the buffer, so let's try getting this
|
|
* size first, and fallback to a default otherwise.
|
|
*/
|
|
ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX);
|
|
if (sz == -1)
|
|
sz = 65536; /* and mine is as good as yours, really */
|
|
|
|
do {
|
|
buf = malloc(sz);
|
|
if (!buf)
|
|
return -ENOMEM;
|
|
int err = getgrnam_r(group, &gr, buf, sz, &pgr);
|
|
/*
|
|
* We're safe to free the buffer here. The strings inside |gr|
|
|
* point inside |buf|, but we don't use any of them; this leaves
|
|
* the pointers dangling but it's safe.
|
|
* |pgr| points at |gr| if getgrnam_r(3) succeeded.
|
|
*/
|
|
free(buf);
|
|
if (err == ERANGE) {
|
|
/* |buf| was too small, retry with a bigger one. */
|
|
sz <<= 1;
|
|
} else if (err != 0) {
|
|
/* We got an error not related to the size of |buf|. */
|
|
return -err;
|
|
} else if (!pgr) {
|
|
/* Not found. */
|
|
return -ENOENT;
|
|
} else {
|
|
*gid = pgr->gr_gid;
|
|
return 0;
|
|
}
|
|
} while (sz <= MAX_GRENT_SZ);
|
|
|
|
/* A buffer of size MAX_GRENT_SZ is still too small, return an error. */
|
|
return -ERANGE;
|
|
}
|
|
|
|
static bool seccomp_action_is_available(const char *wanted)
|
|
{
|
|
if (is_android()) {
|
|
/*
|
|
* Accessing |actions_avail| is generating SELinux denials, so
|
|
* skip for now.
|
|
* TODO(crbug.com/978022, jorgelo): Remove once the denial is
|
|
* fixed.
|
|
*/
|
|
return false;
|
|
}
|
|
const char actions_avail_path[] =
|
|
"/proc/sys/kernel/seccomp/actions_avail";
|
|
FILE *f = fopen(actions_avail_path, "re");
|
|
|
|
if (!f) {
|
|
pwarn("fopen(%s) failed", actions_avail_path);
|
|
return false;
|
|
}
|
|
|
|
attribute_cleanup_str char *actions_avail = NULL;
|
|
size_t buf_size = 0;
|
|
if (getline(&actions_avail, &buf_size, f) < 0) {
|
|
pwarn("getline() failed");
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* This is just substring search, which means that partial matches will
|
|
* match too (e.g. "action" would match "longaction"). There are no
|
|
* seccomp actions which include other actions though, so we're good for
|
|
* now. Eventually we might want to split the string by spaces.
|
|
*/
|
|
return strstr(actions_avail, wanted) != NULL;
|
|
}
|
|
|
|
/*
 * seccomp_ret_log_available: Reports whether the "log" seccomp action is
 * available.
 *
 * The answer from seccomp_action_is_available() is computed on the first
 * call and remembered in a function-local static for later calls.
 */
int seccomp_ret_log_available(void)
{
	/* -1 means "not probed yet". */
	static int cached = -1;

	if (cached != -1)
		return cached;

	cached = seccomp_action_is_available("log");
	return cached;
}
|
|
|
|
/*
 * seccomp_ret_kill_process_available: Reports whether the "kill_process"
 * seccomp action is available.
 *
 * The answer from seccomp_action_is_available() is computed on the first
 * call and remembered in a function-local static for later calls.
 */
int seccomp_ret_kill_process_available(void)
{
	/* -1 means "not probed yet". */
	static int cached = -1;

	if (cached != -1)
		return cached;

	cached = seccomp_action_is_available("kill_process");
	return cached;
}
|
|
|
|
bool seccomp_filter_flags_available(unsigned int flags)
|
|
{
|
|
return sys_seccomp(SECCOMP_SET_MODE_FILTER, flags, NULL) != -1 ||
|
|
errno != EINVAL;
|
|
}
|