首页 > 其他分享 >rootless容器开发指北

rootless容器开发指北

时间:2024-11-01 15:33:17浏览次数:1  
标签:指北 容器 rootless BIND mount dev MS NULL proc

前言:

ruri前不久通过使用uidmap二进制的方式修好了rootless容器无法setgroups()的问题,差不多也该讲讲rootless容器的创建了。

rootless容器创建流程:

1.设置uidmap

我们可以通过读取/etc/subuid和/etc/subgid来获取uid_lower,uid_count和gid_lower,gid_count,
他们的格式为:foo:lower:count.
然后,我们的父进程fork出子进程,父进程unshare(CLONE_NEWUSER),子进程稍作等待,等父进程拥有了user ns后调用uidmap更改父进程的uidmap,然后父进程获得user ns中的root权限,继续容器创建过程。

pid_t pid_1 = fork();
if (pid_1 > 0) {
		// Enable user namespace.
		try_unshare(CLONE_NEWUSER);
		int stat = 0;
		waitpid(pid_1, &stat, 0);
	} else {
		// To ensure that unshare(2) finished in parent process.
		usleep(1000);
		int stat = try_setup_idmap(ppid, uid, gid);
		exit(stat);
	}

具体如何设置uidmap:

static int try_execvp(char *_Nonnull argv[])
{
	/*
	 * fork(2) and then execvp(3).
	 * Return the exit status of the child process.
	 */
	int pid = fork();
	if (pid == 0) {
		execvp(argv[0], argv);
		// If execvp(3) failed, exit as error status 114.
		exit(114);
	}
	int status = 0;
	waitpid(pid, &status, 0);
	return WEXITSTATUS(status);
}
static int try_setup_idmap(pid_t ppid, uid_t uid, gid_t gid)
{
	/*
	 * Try to use `uidmap` suid binary to setup uid and gid map.
	 * If this function failed, we will use set_id_map() to setup the id map.
	 */
	struct ID_MAP id_map = get_idmap(uid, gid);
	// Failed to get /etc/subuid or /etc/subgid config for current user.
	if (id_map.gid_lower == 0 || id_map.uid_lower == 0) {
		return -1;
	}
	char ppid_text[128] = { '\0' };
	char uid_text[128] = { '\0' };
	char uid_lower[128] = { '\0' };
	char uid_count[128] = { '\0' };
	char gid_text[128] = { '\0' };
	char gid_lower[128] = { '\0' };
	char gid_count[128] = { '\0' };
	sprintf(ppid_text, "%d", ppid);
	sprintf(uid_text, "%d", id_map.uid);
	sprintf(uid_lower, "%d", id_map.uid_lower);
	sprintf(uid_count, "%d", id_map.uid_count);
	sprintf(gid_text, "%d", id_map.gid);
	sprintf(gid_lower, "%d", id_map.gid_lower);
	sprintf(gid_count, "%d", id_map.gid_count);
	// In fact I don't know how it works, but it works.
	// newuidmap pid 0 uid 1 1 uid_lower uid_count
	char *newuidmap[] = { "newuidmap", ppid_text, "0", uid_text, "1", "1", uid_lower, uid_count, NULL };
	if (try_execvp(newuidmap) != 0) {
		return -1;
	}
	// newgidmap pid 0 gid 1 1 gid_lower gid_count
	char *newgidmap[] = { "newgidmap", ppid_text, "0", gid_text, "1", "1", gid_lower, gid_count, NULL };
	if (try_execvp(newgidmap) != 0) {
		return -1;
	}
	return 0;
}

2.获取其他namespace的所有权:

为了使用mount(2),我们还需要CLONE_NEWNS来成为当前mount ns的所有者。
为了挂载procfs,我们需要CLONE_NEWPID,很怪,但实测没有这个不行。

// We need to own mount namespace.
try_unshare(CLONE_NEWNS);
// Seems we need to own a new pid namespace for mount procfs.
try_unshare(CLONE_NEWPID);

记得fork()一次以让子进程加入创建的ns。

3.挂载目录:

sysfs从需要从宿主机挂载,由于我们无权mknod,/dev下设备需要从宿主机映射过去。

static void init_rootless_container(struct CONTAINER *_Nonnull container)
{
	/*
	 * For rootless container, the way to create/mount runtime dir/files is different.
	 * as we don't have the permission to mount sysfs and mknod(2),
	 * we need to bind-mount some dirs/files from host.
	 */
	chdir(container->container_dir);
	mkdir("./sys", S_IRUSR | S_IWUSR | S_IROTH | S_IWOTH | S_IRGRP | S_IWGRP);
	mount("/sys", "./sys", NULL, MS_BIND | MS_REC, NULL);
	mkdir("./proc", S_IRUSR | S_IWUSR | S_IROTH | S_IWOTH | S_IRGRP | S_IWGRP);
	mount("proc", "./proc", "proc", MS_NOSUID | MS_NOEXEC | MS_NODEV, NULL);
	mkdir("./dev", S_IRUSR | S_IWUSR | S_IROTH | S_IWOTH | S_IRGRP | S_IWGRP);
	mount("tmpfs", "./dev", "tmpfs", MS_NOSUID, "size=65536k,mode=755");
	close(open("./dev/tty", O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, S_IRUSR | S_IWUSR | S_IROTH | S_IWOTH | S_IRGRP | S_IWGRP));
	mount("/dev/tty", "./dev/tty", NULL, MS_BIND, NULL);
	close(open("./dev/console", O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, S_IRUSR | S_IWUSR | S_IROTH | S_IWOTH | S_IRGRP | S_IWGRP));
	mount("/dev/console", "./dev/console", NULL, MS_BIND, NULL);
	close(open("./dev/null", O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, S_IRUSR | S_IWUSR | S_IROTH | S_IWOTH | S_IRGRP | S_IWGRP));
	mount("/dev/null", "./dev/null", NULL, MS_BIND, NULL);
	close(open("./dev/ptmx", O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, S_IRUSR | S_IWUSR | S_IROTH | S_IWOTH | S_IRGRP | S_IWGRP));
	mount("/dev/ptmx", "./dev/ptmx", NULL, MS_BIND, NULL);
	close(open("./dev/random", O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, S_IRUSR | S_IWUSR | S_IROTH | S_IWOTH | S_IRGRP | S_IWGRP));
	mount("/dev/random", "./dev/random", NULL, MS_BIND, NULL);
	close(open("./dev/urandom", O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, S_IRUSR | S_IWUSR | S_IROTH | S_IWOTH | S_IRGRP | S_IWGRP));
	mount("/dev/urandom", "./dev/urandom", NULL, MS_BIND, NULL);
	close(open("./dev/zero", O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, S_IRUSR | S_IWUSR | S_IROTH | S_IWOTH | S_IRGRP | S_IWGRP));
	mount("/dev/zero", "./dev/zero", NULL, MS_BIND, NULL);
	symlink("/proc/self/fd", "./dev/fd");
	symlink("/proc/self/fd/0", "./dev/stdin");
	symlink("/proc/self/fd/1", "./dev/stdout");
	symlink("/proc/self/fd/2", "./dev/stderr");
	symlink("/dev/null", "./dev/tty0");
	mkdir("./dev/pts", S_IRUSR | S_IWUSR | S_IROTH | S_IWOTH | S_IRGRP | S_IWGRP);
	mount("devpts", "./dev/pts", "devpts", 0, "gid=4,mode=620");
	mkdir("./dev/shm", S_IRUSR | S_IWUSR | S_IROTH | S_IWOTH | S_IRGRP | S_IWGRP);
	mount("tmpfs", "./dev/shm", "tmpfs", MS_NOSUID | MS_NOEXEC | MS_NODEV, "mode=1777");
	mount("./proc/bus", "./proc/bus", NULL, MS_BIND | MS_REC, NULL);
	mount("./proc/bus", "./proc/bus", NULL, MS_BIND | MS_RDONLY | MS_REMOUNT, NULL);
	mount("./proc/fs", "./proc/fs", NULL, MS_BIND | MS_REC, NULL);
	mount("./proc/fs", "./proc/fs", NULL, MS_BIND | MS_RDONLY | MS_REMOUNT, NULL);
	mount("./proc/irq", "./proc/irq", NULL, MS_BIND | MS_REC, NULL);
	mount("./proc/irq", "./proc/irq", NULL, MS_BIND | MS_RDONLY | MS_REMOUNT, NULL);
	mount("./proc/sys", "./proc/sys", NULL, MS_BIND | MS_REC, NULL);
	mount("./proc/sys", "./proc/sys", NULL, MS_BIND | MS_RDONLY | MS_REMOUNT, NULL);
	mount("./proc/sys-trigger", "./proc/sys-trigger", NULL, MS_BIND | MS_REC, NULL);
	mount("./proc/sys-trigger", "./proc/sys-trigger", NULL, MS_BIND | MS_RDONLY | MS_REMOUNT, NULL);
	mount("tmpfs", "./proc/asound", "tmpfs", MS_RDONLY, NULL);
	mount("tmpfs", "./proc/acpi", "tmpfs", MS_RDONLY, NULL);
	mount("/dev/null", "./proc/kcore", "", MS_BIND, NULL);
	mount("/dev/null", "./proc/keys", "", MS_BIND, NULL);
	mount("/dev/null", "./proc/latency_stats", "", MS_BIND, NULL);
	mount("/dev/null", "./proc/timer_list", "", MS_BIND, NULL);
	mount("/dev/null", "./proc/timer_stats", "", MS_BIND, NULL);
	mount("/dev/null", "./proc/sched_debug", "", MS_BIND, NULL);
	mount("tmpfs", "./proc/scsi", "tmpfs", MS_RDONLY, NULL);
	mount("tmpfs", "./sys/firmware", "tmpfs", MS_RDONLY, NULL);
	mount("tmpfs", "./sys/devices/virtual/powercap", "tmpfs", MS_RDONLY, NULL);
	mount("tmpfs", "./sys/block", "tmpfs", MS_RDONLY, NULL);
}

4.chroot并设置capabilitys:

chdir(container->container_dir);
chroot(".");
chdir("/");
// Fix /etc/mtab.
remove("/etc/mtab");
unlink("/etc/mtab");
symlink("/proc/mounts", "/etc/mtab");
drop_caps(container);
usleep(2000);
static void drop_caps(const struct CONTAINER *_Nonnull container)
{
	/*
	 * Drop CapBnd and CapAmb as the config in container->drop_caplist[].
	 * And clear CapInh.
	 * It will be called after chroot(2).
	 */
	for (int i = 0; i < CAP_LAST_CAP + 1; i++) {
		// INIT_VALUE is the end of drop_caplist[].
		if (container->drop_caplist[i] == INIT_VALUE) {
			break;
		}
		// Check if the capability is supported in the kernel,
		// so that we can avoid unnecessary warnings.
		if (CAP_IS_SUPPORTED(container->drop_caplist[i])) {
			// Drop CapBnd.
			if (cap_drop_bound(container->drop_caplist[i]) != 0 && !container->no_warnings) {
				warning("{yellow}Warning: Failed to drop cap `%s`\n", cap_to_name(container->drop_caplist[i]));
				warning("{yellow}error reason: %s{clear}\n", strerror(errno));
			}
			// Drop CapAmb.
			prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_LOWER, container->drop_caplist[i], 0, 0);
		}
	}
	// Clear CapInh.
	// hrdp and datap are two pointers, so we malloc() to apply the memory for it first.
	cap_user_header_t hrdp = (cap_user_header_t)malloc(sizeof(typeof(*hrdp)));
	cap_user_data_t datap = (cap_user_data_t)malloc(sizeof(typeof(*datap)));
	hrdp->pid = getpid();
	hrdp->version = _LINUX_CAPABILITY_VERSION_3;
	capget(hrdp, datap);
	datap->inheritable = 0;
	capset(hrdp, datap);
// free(2) hrdp and datap here might cause ASAN error.
// I think it's because the kernel will write to the memory directly,
// but I don't know the behavior of ASAN to allocate memory, maybe after this,
// ASAN can not recognize the memory is allocated by it.
#ifndef RURI_DEBUG
	free(hrdp);
	free(datap);
#endif
}

5.运行命令:

execvp(container->command[0], container->command);

于是我们完成了rootless容器的创建和运行。

问题修复:

在解压rootfs时如果遇到权限问题,可以使用proot -0 tar -xxxxx来解压。
如果在容器中遇到Couldn't create temporary file /tmp/apt.conf.sIKx3J for passing config to apt-key这样的错误,请chmod 777 /tmp

标签:指北,容器,rootless,BIND,mount,dev,MS,NULL,proc
From: https://www.cnblogs.com/Moe-hacker/p/18520366

相关文章

  • 华为云企业主机安全检测升级,再添容器安全新翼
    来源:本文分享自华为云开发者社区《必看!HSS检测增强这2个新特性,你一定要知道!》,原文作者:阅识风云阅识风云是华为云信息大咖,擅长将复杂信息多元化呈现,其出品的一张图(云图说)、深入浅出的博文(云小课)或短视频(云视厅)总有一款能让您快速上手华为云。更多精彩内容请单击此处。华为......
  • 在K8S中,创建init c容器后其状态不正常 如何解决?
    在Kubernetes(K8S)中,如果在创建init(初始化)容器后其状态不正常,这通常意味着初始化容器在执行过程中遇到了问题。为了解决这个问题,可以按照以下步骤进行详细的排查和解决:1.查看Pod状态和日志查看Pod状态:使用kubectlgetpods命令查看Pod的状态,特别注意init容器的状态。如果init......
  • Debian 基础镜像与容器构建最佳实践指南
    引言在容器化应用开发中,选择合适的基础镜像至关重要。Debian作为一个稳定、安全且广受欢迎的Linux发行版,其官方Docker镜像成为了许多开发者的首选。本文将深入探讨Debian基础镜像的类型、特点,以及如何在容器构建中有效地使用它们。1.Debian基础镜像概述Debian......
  • Prometheus03 Prometheus服务发现, 各种exporter, 容器化监控, Federation联邦, Victo
    6服务发现6.1服务发现原理6.2文件服务发现#准备主机节点列表文件,可以支持yaml格式和json格式#注意:此文件不建议就地编写生成,可能出现加载一部分的情况cattargets/prometheus*.yaml-targets:-master1:9100labels:app:prometheus#修改prometheus配置......
  • docker容器安装nacos详解
    ‌Nacos的核心功能‌Nacos是一个动态服务发现、配置管理和服务管理平台,旨在帮助构建云原生应用。它支持服务注册与发现、配置管理、‌服务健康监测等功能,适用于微服务和云原生架构。Nacos提供了友好的‌Web界面和‌API接口,方便用户进行配置管理、服务注册和发现等操作。1.打......
  • 基于STL的自定义栈与队列实现:灵活选择底层容器的模板设计
    文章目录代码模板设计主要成员函数底层容器的选择模板设计底层容器的选择关于stack的示例代码关于queue的示例代码前言:在本文中,我们将分析一个模拟C++标准模板库(STL)栈的自定义实现以及模仿C++标准模板库(STL)队列的自定义实现。该stack类模板允许在底层容器的选择......
  • K8s 容器的定向调度与亲和性
    K8s容器的定向调度与亲和性K8s集群节点CPU使用率高!内存溢出(OOM)!宕机!导致大量微服务瘫痪怎么办?可能是调度策略没做好,看完这篇文章掌握提高集群稳定性的管理诀窍。Kubernetes(K8s)是一个开源的容器编排工具,而容器调度是其非常重要的特性,所谓的调度是指将容器(Pod)分配到集群中的......
  • WPF重写了ListView的ItemsPanel,改用WrapPanel做容器。不能自动换行问题
    直接上正确代码:1<ListViewx:Name="lv_product"HorizontalContentAlignment="Stretch"ItemsSource="{BindingProducts}"2ScrollViewer.HorizontalScrollBarVisibility="Disabled"ScrollViewer.VerticalScrollB......
  • k8s之调动pod到指定节点与创建多容器pod并查找pod日志
    在Kubernetes中,可以通过以下步骤将Pod调度到指定节点、创建多容器Pod,并查找Pod日志。1.将Pod调度到指定节点要将Pod调度到特定节点,可以使用nodeSelector或nodeAffinity进行调度。方法一:使用nodeSelector首先,需要确保节点具有指定的标签,然后在Pod配置......
  • 如何在PVE环境中安装Docker:选择LXC容器还是虚拟机方案
    原文参考:https://www.oryoy.com/news/rong-qi-hua 在当今的云计算和虚拟化技术中,ProxmoxVE(PVE)作为一种强大的开源虚拟化平台,受到了许多系统管理员和开发者的青睐。而在PVE环境中安装Docker,成为了许多用户的需求。本文将详细探讨在PVE环境中安装Docker的两种主要方案:使用LXC容......