Commit 05c6ca85 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'x86-urgent-2022-06-19' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Thomas Gleixner:

 - Make RESERVE_BRK() work again with older binutils. The recent
   'simplification' broke that.

 - Make early #VE handling increment RIP when successful.

 - Make the #VE code consistent vs. the RIP adjustments and add
   comments.

 - Handle load_unaligned_zeropad() across page boundaries correctly in
   #VE when the second page is shared.

* tag 'x86-urgent-2022-06-19' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/tdx: Handle load_unaligned_zeropad() page-cross to a shared page
  x86/tdx: Clarify RIP adjustments in #VE handler
  x86/tdx: Fix early #VE handling
  x86/mm: Fix RESERVE_BRK() for older binutils
parents 5d770f11 1e776965
Loading
Loading
Loading
Loading
+136 −51
Original line number Diff line number Diff line
@@ -124,6 +124,51 @@ static u64 get_cc_mask(void)
	return BIT_ULL(gpa_width - 1);
}

/*
 * The TDX module spec states that #VE may be injected for a limited set of
 * reasons:
 *
 *  - Emulation of the architectural #VE injection on EPT violation;
 *
 *  - As a result of guest TD execution of a disallowed instruction,
 *    a disallowed MSR access, or CPUID virtualization;
 *
 *  - A notification to the guest TD about anomalous behavior;
 *
 * The last one is opt-in and is not used by the kernel.
 *
 * The Intel Software Developer's Manual describes cases when instruction
 * length field can be used in section "Information for VM Exits Due to
 * Instruction Execution".
 *
 * For TDX, it ultimately means GET_VEINFO provides reliable instruction length
 * information if #VE occurred due to instruction execution, but not for EPT
 * violations.
 */
static int ve_instr_len(struct ve_info *ve)
{
	switch (ve->exit_reason) {
	case EXIT_REASON_HLT:
	case EXIT_REASON_MSR_READ:
	case EXIT_REASON_MSR_WRITE:
	case EXIT_REASON_CPUID:
	case EXIT_REASON_IO_INSTRUCTION:
		/* It is safe to use ve->instr_len for #VE due instructions */
		return ve->instr_len;
	case EXIT_REASON_EPT_VIOLATION:
		/*
		 * For EPT violations, ve->insn_len is not defined. For those,
		 * the kernel must decode instructions manually and should not
		 * be using this function.
		 */
		WARN_ONCE(1, "ve->instr_len is not defined for EPT violations");
		return 0;
	default:
		WARN_ONCE(1, "Unexpected #VE-type: %lld\n", ve->exit_reason);
		return ve->instr_len;
	}
}

static u64 __cpuidle __halt(const bool irq_disabled, const bool do_sti)
{
	struct tdx_hypercall_args args = {
@@ -147,7 +192,7 @@ static u64 __cpuidle __halt(const bool irq_disabled, const bool do_sti)
	return __tdx_hypercall(&args, do_sti ? TDX_HCALL_ISSUE_STI : 0);
}

static bool handle_halt(void)
static int handle_halt(struct ve_info *ve)
{
	/*
	 * Since non safe halt is mainly used in CPU offlining
@@ -158,9 +203,9 @@ static bool handle_halt(void)
	const bool do_sti = false;

	if (__halt(irq_disabled, do_sti))
		return false;
		return -EIO;

	return true;
	return ve_instr_len(ve);
}

void __cpuidle tdx_safe_halt(void)
@@ -180,7 +225,7 @@ void __cpuidle tdx_safe_halt(void)
		WARN_ONCE(1, "HLT instruction emulation failed\n");
}

static bool read_msr(struct pt_regs *regs)
static int read_msr(struct pt_regs *regs, struct ve_info *ve)
{
	struct tdx_hypercall_args args = {
		.r10 = TDX_HYPERCALL_STANDARD,
@@ -194,14 +239,14 @@ static bool read_msr(struct pt_regs *regs)
	 * (GHCI), section titled "TDG.VP.VMCALL<Instruction.RDMSR>".
	 */
	if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT))
		return false;
		return -EIO;

	regs->ax = lower_32_bits(args.r11);
	regs->dx = upper_32_bits(args.r11);
	return true;
	return ve_instr_len(ve);
}

static bool write_msr(struct pt_regs *regs)
static int write_msr(struct pt_regs *regs, struct ve_info *ve)
{
	struct tdx_hypercall_args args = {
		.r10 = TDX_HYPERCALL_STANDARD,
@@ -215,10 +260,13 @@ static bool write_msr(struct pt_regs *regs)
	 * can be found in TDX Guest-Host-Communication Interface
	 * (GHCI) section titled "TDG.VP.VMCALL<Instruction.WRMSR>".
	 */
	return !__tdx_hypercall(&args, 0);
	if (__tdx_hypercall(&args, 0))
		return -EIO;

	return ve_instr_len(ve);
}

static bool handle_cpuid(struct pt_regs *regs)
static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve)
{
	struct tdx_hypercall_args args = {
		.r10 = TDX_HYPERCALL_STANDARD,
@@ -236,7 +284,7 @@ static bool handle_cpuid(struct pt_regs *regs)
	 */
	if (regs->ax < 0x40000000 || regs->ax > 0x4FFFFFFF) {
		regs->ax = regs->bx = regs->cx = regs->dx = 0;
		return true;
		return ve_instr_len(ve);
	}

	/*
@@ -245,7 +293,7 @@ static bool handle_cpuid(struct pt_regs *regs)
	 * (GHCI), section titled "VP.VMCALL<Instruction.CPUID>".
	 */
	if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT))
		return false;
		return -EIO;

	/*
	 * As per TDX GHCI CPUID ABI, r12-r15 registers contain contents of
@@ -257,7 +305,7 @@ static bool handle_cpuid(struct pt_regs *regs)
	regs->cx = args.r14;
	regs->dx = args.r15;

	return true;
	return ve_instr_len(ve);
}

static bool mmio_read(int size, unsigned long addr, unsigned long *val)
@@ -283,10 +331,10 @@ static bool mmio_write(int size, unsigned long addr, unsigned long val)
			       EPT_WRITE, addr, val);
}

static bool handle_mmio(struct pt_regs *regs, struct ve_info *ve)
static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
{
	unsigned long *reg, val, vaddr;
	char buffer[MAX_INSN_SIZE];
	unsigned long *reg, val;
	struct insn insn = {};
	enum mmio_type mmio;
	int size, extend_size;
@@ -294,34 +342,49 @@ static bool handle_mmio(struct pt_regs *regs, struct ve_info *ve)

	/* Only in-kernel MMIO is supported */
	if (WARN_ON_ONCE(user_mode(regs)))
		return false;
		return -EFAULT;

	if (copy_from_kernel_nofault(buffer, (void *)regs->ip, MAX_INSN_SIZE))
		return false;
		return -EFAULT;

	if (insn_decode(&insn, buffer, MAX_INSN_SIZE, INSN_MODE_64))
		return false;
		return -EINVAL;

	mmio = insn_decode_mmio(&insn, &size);
	if (WARN_ON_ONCE(mmio == MMIO_DECODE_FAILED))
		return false;
		return -EINVAL;

	if (mmio != MMIO_WRITE_IMM && mmio != MMIO_MOVS) {
		reg = insn_get_modrm_reg_ptr(&insn, regs);
		if (!reg)
			return false;
			return -EINVAL;
	}

	ve->instr_len = insn.length;
	/*
	 * Reject EPT violation #VEs that split pages.
	 *
	 * MMIO accesses are supposed to be naturally aligned and therefore
	 * never cross page boundaries. Seeing split page accesses indicates
	 * a bug or a load_unaligned_zeropad() that stepped into an MMIO page.
	 *
	 * load_unaligned_zeropad() will recover using exception fixups.
	 */
	vaddr = (unsigned long)insn_get_addr_ref(&insn, regs);
	if (vaddr / PAGE_SIZE != (vaddr + size - 1) / PAGE_SIZE)
		return -EFAULT;

	/* Handle writes first */
	switch (mmio) {
	case MMIO_WRITE:
		memcpy(&val, reg, size);
		return mmio_write(size, ve->gpa, val);
		if (!mmio_write(size, ve->gpa, val))
			return -EIO;
		return insn.length;
	case MMIO_WRITE_IMM:
		val = insn.immediate.value;
		return mmio_write(size, ve->gpa, val);
		if (!mmio_write(size, ve->gpa, val))
			return -EIO;
		return insn.length;
	case MMIO_READ:
	case MMIO_READ_ZERO_EXTEND:
	case MMIO_READ_SIGN_EXTEND:
@@ -334,15 +397,15 @@ static bool handle_mmio(struct pt_regs *regs, struct ve_info *ve)
		 * decoded or handled properly. It was likely not using io.h
		 * helpers or accessed MMIO accidentally.
		 */
		return false;
		return -EINVAL;
	default:
		WARN_ONCE(1, "Unknown insn_decode_mmio() decode value?");
		return false;
		return -EINVAL;
	}

	/* Handle reads */
	if (!mmio_read(size, ve->gpa, &val))
		return false;
		return -EIO;

	switch (mmio) {
	case MMIO_READ:
@@ -364,13 +427,13 @@ static bool handle_mmio(struct pt_regs *regs, struct ve_info *ve)
	default:
		/* All other cases has to be covered with the first switch() */
		WARN_ON_ONCE(1);
		return false;
		return -EINVAL;
	}

	if (extend_size)
		memset(reg, extend_val, extend_size);
	memcpy(reg, &val, size);
	return true;
	return insn.length;
}

static bool handle_in(struct pt_regs *regs, int size, int port)
@@ -421,13 +484,14 @@ static bool handle_out(struct pt_regs *regs, int size, int port)
 *
 * Return True on success or False on failure.
 */
static bool handle_io(struct pt_regs *regs, u32 exit_qual)
static int handle_io(struct pt_regs *regs, struct ve_info *ve)
{
	u32 exit_qual = ve->exit_qual;
	int size, port;
	bool in;
	bool in, ret;

	if (VE_IS_IO_STRING(exit_qual))
		return false;
		return -EIO;

	in   = VE_IS_IO_IN(exit_qual);
	size = VE_GET_IO_SIZE(exit_qual);
@@ -435,9 +499,13 @@ static bool handle_io(struct pt_regs *regs, u32 exit_qual)


	if (in)
		return handle_in(regs, size, port);
		ret = handle_in(regs, size, port);
	else
		return handle_out(regs, size, port);
		ret = handle_out(regs, size, port);
	if (!ret)
		return -EIO;

	return ve_instr_len(ve);
}

/*
@@ -447,13 +515,19 @@ static bool handle_io(struct pt_regs *regs, u32 exit_qual)
__init bool tdx_early_handle_ve(struct pt_regs *regs)
{
	struct ve_info ve;
	int insn_len;

	tdx_get_ve_info(&ve);

	if (ve.exit_reason != EXIT_REASON_IO_INSTRUCTION)
		return false;

	return handle_io(regs, ve.exit_qual);
	insn_len = handle_io(regs, &ve);
	if (insn_len < 0)
		return false;

	regs->ip += insn_len;
	return true;
}

void tdx_get_ve_info(struct ve_info *ve)
@@ -486,54 +560,65 @@ void tdx_get_ve_info(struct ve_info *ve)
	ve->instr_info  = upper_32_bits(out.r10);
}

/* Handle the user initiated #VE */
static bool virt_exception_user(struct pt_regs *regs, struct ve_info *ve)
/*
 * Handle the user initiated #VE.
 *
 * On success, returns the number of bytes RIP should be incremented (>=0)
 * or -errno on error.
 */
static int virt_exception_user(struct pt_regs *regs, struct ve_info *ve)
{
	switch (ve->exit_reason) {
	case EXIT_REASON_CPUID:
		return handle_cpuid(regs);
		return handle_cpuid(regs, ve);
	default:
		pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
		return false;
		return -EIO;
	}
}

/* Handle the kernel #VE */
static bool virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve)
/*
 * Handle the kernel #VE.
 *
 * On success, returns the number of bytes RIP should be incremented (>=0)
 * or -errno on error.
 */
static int virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve)
{
	switch (ve->exit_reason) {
	case EXIT_REASON_HLT:
		return handle_halt();
		return handle_halt(ve);
	case EXIT_REASON_MSR_READ:
		return read_msr(regs);
		return read_msr(regs, ve);
	case EXIT_REASON_MSR_WRITE:
		return write_msr(regs);
		return write_msr(regs, ve);
	case EXIT_REASON_CPUID:
		return handle_cpuid(regs);
		return handle_cpuid(regs, ve);
	case EXIT_REASON_EPT_VIOLATION:
		return handle_mmio(regs, ve);
	case EXIT_REASON_IO_INSTRUCTION:
		return handle_io(regs, ve->exit_qual);
		return handle_io(regs, ve);
	default:
		pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
		return false;
		return -EIO;
	}
}

bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve)
{
	bool ret;
	int insn_len;

	if (user_mode(regs))
		ret = virt_exception_user(regs, ve);
		insn_len = virt_exception_user(regs, ve);
	else
		ret = virt_exception_kernel(regs, ve);
		insn_len = virt_exception_kernel(regs, ve);
	if (insn_len < 0)
		return false;

	/* After successful #VE handling, move the IP */
	if (ret)
		regs->ip += ve->instr_len;
	regs->ip += insn_len;

	return ret;
	return true;
}

static bool tdx_tlb_flush_required(bool private)
+21 −17
Original line number Diff line number Diff line
@@ -108,19 +108,16 @@ extern unsigned long _brk_end;
void *extend_brk(size_t size, size_t align);

/*
 * Reserve space in the brk section.  The name must be unique within the file,
 * and somewhat descriptive.  The size is in bytes.
 * Reserve space in the .brk section, which is a block of memory from which the
 * caller is allowed to allocate very early (before even memblock is available)
 * by calling extend_brk().  All allocated memory will be eventually converted
 * to memblock.  Any leftover unallocated memory will be freed.
 *
 * The allocation is done using inline asm (rather than using a section
 * attribute on a normal variable) in order to allow the use of @nobits, so
 * that it doesn't take up any space in the vmlinux file.
 * The size is in bytes.
 */
#define RESERVE_BRK(name, size)					\
	asm(".pushsection .brk_reservation,\"aw\",@nobits\n\t"		\
	    ".brk." #name ":\n\t"					\
	    ".skip " __stringify(size) "\n\t"				\
	    ".size .brk." #name ", " __stringify(size) "\n\t"		\
	    ".popsection\n\t")
	__section(".bss..brk") __aligned(1) __used	\
	static char __brk_##name[size]

extern void probe_roms(void);
#ifdef __i386__
@@ -133,12 +130,19 @@ asmlinkage void __init x86_64_start_reservations(char *real_mode_data);

#endif /* __i386__ */
#endif /* _SETUP */
#else
#define RESERVE_BRK(name,sz)				\
	.pushsection .brk_reservation,"aw",@nobits;	\
.brk.name:						\
1:	.skip sz;					\
	.size .brk.name,.-1b;				\

#else  /* __ASSEMBLY */

.macro __RESERVE_BRK name, size
	.pushsection .bss..brk, "aw"
SYM_DATA_START(__brk_\name)
	.skip \size
SYM_DATA_END(__brk_\name)
	.popsection
.endm

#define RESERVE_BRK(name, size) __RESERVE_BRK name, size

#endif /* __ASSEMBLY__ */

#endif /* _ASM_X86_SETUP_H */
+0 −5
Original line number Diff line number Diff line
@@ -67,11 +67,6 @@ RESERVE_BRK(dmi_alloc, 65536);
#endif


/*
 * Range of the BSS area. The size of the BSS area is determined
 * at link time, with RESERVE_BRK() facility reserving additional
 * chunks.
 */
unsigned long _brk_start = (unsigned long)__brk_base;
unsigned long _brk_end   = (unsigned long)__brk_base;

+2 −2
Original line number Diff line number Diff line
@@ -385,10 +385,10 @@ SECTIONS
	__end_of_kernel_reserve = .;

	. = ALIGN(PAGE_SIZE);
	.brk : AT(ADDR(.brk) - LOAD_OFFSET) {
	.brk (NOLOAD) : AT(ADDR(.brk) - LOAD_OFFSET) {
		__brk_base = .;
		. += 64 * 1024;		/* 64k alignment slop space */
		*(.brk_reservation)	/* areas brk users have reserved */
		*(.bss..brk)		/* areas brk users have reserved */
		__brk_limit = .;
	}