Commit 1423e266 authored by Linus Torvalds

Merge tag 'x86-fpu-2021-07-07' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fpu updates from Thomas Gleixner:
 "Fixes and improvements for FPU handling on x86:

   - Prevent sigaltstack out of bounds writes.

     The kernel unconditionally writes the FPU state to the alternate
     stack without checking whether the stack is large enough to
     accommodate it.

     Check the alternate stack size before doing so and, in case it is
     too small, force a SIGSEGV instead of silently corrupting user
     space data.

   - MINSIGSTKSZ and SIGSTKSZ are constants in signal.h and have never
     been updated, despite the fact that the FPU state which is stored
     on the signal stack has grown over time, which causes trouble in
     the field when AVX512 is available on a CPU. The kernel does not
     expose the minimum alternate stack size required for the available
     and enabled CPU features.

     ARM already added an aux vector AT_MINSIGSTKSZ for the same
     reason. Add it to x86 as well (a user-space sizing sketch follows
     this message).

   - A major cleanup of the x86 FPU code. The recent discoveries of
     XSTATE-related issues unearthed quite a few inconsistencies,
     duplicated code paths and other problems.

     The fine-grained overhaul addresses this and makes the code more
     robust and maintainable, which allows upcoming XSTATE-related
     features to be integrated in sane ways"
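
For orientation, here is a minimal user-space sketch of the sizing scheme
these changes enable. It is illustrative only; the fallback to MINSIGSTKSZ
for kernels which do not supply AT_MINSIGSTKSZ is an assumption, not part
of the new ABI:

   #include <sys/auxv.h>
   #include <signal.h>
   #include <stdlib.h>
   #include <unistd.h>
   #include <err.h>

   #ifndef AT_MINSIGSTKSZ
   #define AT_MINSIGSTKSZ	51	/* aux vector tag added by this merge */
   #endif

   static void handler(int sig)
   {
        /* Executes on the alternate stack installed in main(). */
        static const char msg[] = "handler ran on the alternate stack\n";
        write(STDERR_FILENO, msg, sizeof(msg) - 1);
   }

   int main(void)
   {
        /* getauxval() returns 0 if this kernel does not supply the tag. */
        unsigned long kern_min = getauxval(AT_MINSIGSTKSZ);
        stack_t ss = { 0 };
        struct sigaction sa = { 0 };

        /* Kernel-required space plus headroom for the handler itself. */
        ss.ss_size = (kern_min ? kern_min : MINSIGSTKSZ) + SIGSTKSZ;
        ss.ss_sp = malloc(ss.ss_size);
        if (!ss.ss_sp)
             err(1, "malloc");
        if (sigaltstack(&ss, NULL))
             err(1, "sigaltstack");

        sa.sa_handler = handler;
        sa.sa_flags = SA_ONSTACK;	/* deliver SIGUSR1 on the altstack */
        sigemptyset(&sa.sa_mask);
        if (sigaction(SIGUSR1, &sa, NULL))
             err(1, "sigaction");

        raise(SIGUSR1);
        return 0;
   }

With the sigaltstack change above, a stack which passes the sigaltstack()
syscall check but is too small for the hardware's actual XSTATE frame
(e.g. a bare MINSIGSTKSZ allocation on an AVX-512 machine) now results in
a SIGSEGV at signal delivery instead of a silent write past the
allocation.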

* tag 'x86-fpu-2021-07-07' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (74 commits)
  x86/fpu/xstate: Clear xstate header in copy_xstate_to_uabi_buf() again
  x86/fpu/signal: Let xrstor handle the features to init
  x86/fpu/signal: Handle #PF in the direct restore path
  x86/fpu: Return proper error codes from user access functions
  x86/fpu/signal: Split out the direct restore code
  x86/fpu/signal: Sanitize copy_user_to_fpregs_zeroing()
  x86/fpu/signal: Sanitize the xstate check on sigframe
  x86/fpu/signal: Remove the legacy alignment check
  x86/fpu/signal: Move initial checks into fpu__restore_sig()
  x86/fpu: Mark init_fpstate __ro_after_init
  x86/pkru: Remove xstate fiddling from write_pkru()
  x86/fpu: Don't store PKRU in xstate in fpu_reset_fpstate()
  x86/fpu: Remove PKRU handling from switch_fpu_finish()
  x86/fpu: Mask PKRU from kernel XRSTOR[S] operations
  x86/fpu: Hook up PKRU into ptrace()
  x86/fpu: Add PKRU storage outside of task XSAVE buffer
  x86/fpu: Dont restore PKRU in fpregs_restore_userspace()
  x86/fpu: Rename xfeatures_mask_user() to xfeatures_mask_uabi()
  x86/fpu: Move FXSAVE_LEAK quirk info __copy_kernel_to_fpregs()
  x86/fpu: Rename __fpregs_load_activate() to fpregs_restore_userregs()
  ...
parents 4ea90317 93c2cdc9
Documentation/x86/elf_auxvec.rst  +53 −0
.. SPDX-License-Identifier: GPL-2.0

==================================
x86-specific ELF Auxiliary Vectors
==================================

This document describes the semantics of the x86 auxiliary vectors.

Introduction
============

ELF Auxiliary vectors enable the kernel to efficiently provide
configuration-specific parameters to userspace. In this example, a program
allocates an alternate stack based on the kernel-provided size::

   #include <sys/auxv.h>
   #include <elf.h>
   #include <signal.h>
   #include <stdlib.h>
   #include <assert.h>
   #include <err.h>

   #ifndef AT_MINSIGSTKSZ
   #define AT_MINSIGSTKSZ	51
   #endif

   ....
   stack_t ss;

   /* Size the stack before allocating it: malloc() consumes ss_size. */
   ss.ss_size = getauxval(AT_MINSIGSTKSZ) + SIGSTKSZ;
   ss.ss_flags = 0;

   ss.ss_sp = malloc(ss.ss_size);
   assert(ss.ss_sp);

   if (sigaltstack(&ss, NULL))
        err(1, "sigaltstack");


The exposed auxiliary vectors
=============================

AT_SYSINFO is used for locating the vsyscall entry point.  It is not
exported in 64-bit mode.

AT_SYSINFO_EHDR is the start address of the page containing the vDSO.

AT_MINSIGSTKSZ denotes the minimum stack size required by the kernel to
deliver a signal to user-space.  AT_MINSIGSTKSZ covers only the space
consumed by the kernel to accommodate the user context for the current
hardware configuration.  It does not cover subsequent user-space stack
consumption, which must be added by the user.  (For example, above,
user-space adds SIGSTKSZ to AT_MINSIGSTKSZ.)
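
On older kernels which do not supply the tag, getauxval(AT_MINSIGSTKSZ)
returns 0.  A defensive helper can combine the kernel minimum with the
handler's own worst-case usage; the helper name sigstk_size() and the
fallback to MINSIGSTKSZ below are illustrative assumptions, not part of
this ABI::

   #include <stddef.h>
   #include <signal.h>
   #include <sys/auxv.h>

   static size_t sigstk_size(size_t handler_use)
   {
        unsigned long kern_min = getauxval(AT_MINSIGSTKSZ);

        /* Fall back to the legacy constant on kernels without the tag. */
        if (!kern_min)
             kern_min = MINSIGSTKSZ;
        return kern_min + handler_use;
   }

Sized this way, ss_size = sigstk_size(SIGSTKSZ) reproduces the addition
performed in the example above.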
Documentation/x86/index.rst  +1 −0
@@ -36,3 +36,4 @@ x86-specific Documentation
   sva
   sgx
   features
+   elf_auxvec
arch/x86/events/intel/lbr.c  +3 −3
@@ -491,7 +491,7 @@ static void intel_pmu_arch_lbr_xrstors(void *ctx)
{
	struct x86_perf_task_context_arch_lbr_xsave *task_ctx = ctx;

-	copy_kernel_to_dynamic_supervisor(&task_ctx->xsave, XFEATURE_MASK_LBR);
+	xrstors(&task_ctx->xsave, XFEATURE_MASK_LBR);
}

static __always_inline bool lbr_is_reset_in_cstate(void *ctx)
@@ -576,7 +576,7 @@ static void intel_pmu_arch_lbr_xsaves(void *ctx)
{
	struct x86_perf_task_context_arch_lbr_xsave *task_ctx = ctx;

-	copy_dynamic_supervisor_to_kernel(&task_ctx->xsave, XFEATURE_MASK_LBR);
+	xsaves(&task_ctx->xsave, XFEATURE_MASK_LBR);
}

static void __intel_pmu_lbr_save(void *ctx)
@@ -993,7 +993,7 @@ static void intel_pmu_arch_lbr_read_xsave(struct cpu_hw_events *cpuc)
		intel_pmu_store_lbr(cpuc, NULL);
		return;
	}
-	copy_dynamic_supervisor_to_kernel(&xsave->xsave, XFEATURE_MASK_LBR);
+	xsaves(&xsave->xsave, XFEATURE_MASK_LBR);

	intel_pmu_store_lbr(cpuc, xsave->lbr.entries);
}
arch/x86/include/asm/elf.h  +4 −0
@@ -312,6 +312,7 @@ do { \
		NEW_AUX_ENT(AT_SYSINFO,	VDSO_ENTRY);			\
		NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE);	\
	}								\
+	NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size());		\
} while (0)

/*
@@ -328,6 +329,7 @@ extern unsigned long task_size_32bit(void);
extern unsigned long task_size_64bit(int full_addr_space);
extern unsigned long get_mmap_base(int is_legacy);
extern bool mmap_address_hint_valid(unsigned long addr, unsigned long len);
+extern unsigned long get_sigframe_size(void);

#ifdef CONFIG_X86_32

@@ -349,6 +351,7 @@ do { \
	if (vdso64_enabled)						\
		NEW_AUX_ENT(AT_SYSINFO_EHDR,				\
			    (unsigned long __force)current->mm->context.vdso); \
+	NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size());		\
} while (0)

/* As a historical oddity, the x32 and x86_64 vDSOs are controlled together. */
@@ -357,6 +360,7 @@ do { \
	if (vdso64_enabled)						\
		NEW_AUX_ENT(AT_SYSINFO_EHDR,				\
			    (unsigned long __force)current->mm->context.vdso); \
+	NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size());		\
} while (0)

#define AT_SYSINFO		32
arch/x86/include/asm/fpu/internal.h  +80 −122
@@ -26,16 +26,17 @@
/*
 * High level FPU state handling functions:
 */
-extern void fpu__prepare_read(struct fpu *fpu);
-extern void fpu__prepare_write(struct fpu *fpu);
-extern void fpu__save(struct fpu *fpu);
extern int  fpu__restore_sig(void __user *buf, int ia32_frame);
extern void fpu__drop(struct fpu *fpu);
-extern int  fpu__copy(struct task_struct *dst, struct task_struct *src);
extern void fpu__clear_user_states(struct fpu *fpu);
-extern void fpu__clear_all(struct fpu *fpu);
extern int  fpu__exception_code(struct fpu *fpu, int trap_nr);

+extern void fpu_sync_fpstate(struct fpu *fpu);

+/* Clone and exit operations */
+extern int  fpu_clone(struct task_struct *dst);
+extern void fpu_flush_thread(void);

/*
 * Boot time FPU initialization functions:
 */
@@ -45,7 +46,6 @@ extern void fpu__init_cpu_xstate(void);
extern void fpu__init_system(struct cpuinfo_x86 *c);
extern void fpu__init_check_bugs(void);
extern void fpu__resume_cpu(void);
-extern u64 fpu__get_supported_xfeatures_mask(void);

/*
 * Debugging facility:
@@ -86,23 +86,9 @@ extern void fpstate_init_soft(struct swregs_state *soft);
#else
static inline void fpstate_init_soft(struct swregs_state *soft) {}
#endif
+extern void save_fpregs_to_fpstate(struct fpu *fpu);

-static inline void fpstate_init_xstate(struct xregs_state *xsave)
-{
-	/*
-	 * XRSTORS requires these bits set in xcomp_bv, or it will
-	 * trigger #GP:
-	 */
-	xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask_all;
-}
-
-static inline void fpstate_init_fxstate(struct fxregs_state *fx)
-{
-	fx->cwd = 0x37f;
-	fx->mxcsr = MXCSR_DEFAULT;
-}
-extern void fpstate_sanitize_xstate(struct fpu *fpu);

+/* Returns 0 or the negated trap number, which results in -EFAULT for #PF */
#define user_insn(insn, output, input...)				\
({									\
	int err;							\
@@ -110,14 +96,14 @@ extern void fpstate_sanitize_xstate(struct fpu *fpu);
	might_fault();							\
									\
	asm volatile(ASM_STAC "\n"					\
		     "1:" #insn "\n\t"					\
		     "1: " #insn "\n"					\
		     "2: " ASM_CLAC "\n"				\
		     ".section .fixup,\"ax\"\n"				\
		     "3:  movl $-1,%[err]\n"				\
		     "3:  negl %%eax\n"					\
		     "    jmp  2b\n"					\
		     ".previous\n"					\
-		     _ASM_EXTABLE(1b, 3b)				\
-		     : [err] "=r" (err), output				\
+		     _ASM_EXTABLE_FAULT(1b, 3b)				\
+		     : [err] "=a" (err), output				\
		     : "0"(0), input);					\
	err;								\
})
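
Editorial note on the error convention introduced in this hunk: the fixup
path now returns the negated hardware trap number in @err rather than a
fixed -1. For page faults this reads as -EFAULT because the x86 #PF vector
number and the EFAULT errno value are both 14, a numerical correspondence
the convention relies on and one that is cheap to assert from user space
(illustrative check, not part of the kernel change):

   #include <errno.h>

   /* The x86 #PF vector is 14; EFAULT is also 14 on Linux. */
   _Static_assert(EFAULT == 14, "negated #PF trap number must equal -EFAULT");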
@@ -143,12 +129,12 @@ extern void fpstate_sanitize_xstate(struct fpu *fpu);
		     _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_fprestore)	\
		     : output : input)

-static inline int copy_fregs_to_user(struct fregs_state __user *fx)
+static inline int fnsave_to_user_sigframe(struct fregs_state __user *fx)
{
	return user_insn(fnsave %[fx]; fwait,  [fx] "=m" (*fx), "m" (*fx));
}

-static inline int copy_fxregs_to_user(struct fxregs_state __user *fx)
+static inline int fxsave_to_user_sigframe(struct fxregs_state __user *fx)
{
	if (IS_ENABLED(CONFIG_X86_32))
		return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx));
@@ -157,7 +143,7 @@ static inline int copy_fxregs_to_user(struct fxregs_state __user *fx)

}

-static inline void copy_kernel_to_fxregs(struct fxregs_state *fx)
+static inline void fxrstor(struct fxregs_state *fx)
{
	if (IS_ENABLED(CONFIG_X86_32))
		kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
@@ -165,7 +151,7 @@ static inline void copy_kernel_to_fxregs(struct fxregs_state *fx)
		kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
}

-static inline int copy_kernel_to_fxregs_err(struct fxregs_state *fx)
+static inline int fxrstor_safe(struct fxregs_state *fx)
{
	if (IS_ENABLED(CONFIG_X86_32))
		return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
@@ -173,7 +159,7 @@ static inline int copy_kernel_to_fxregs_err(struct fxregs_state *fx)
		return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
}

-static inline int copy_user_to_fxregs(struct fxregs_state __user *fx)
+static inline int fxrstor_from_user_sigframe(struct fxregs_state __user *fx)
{
	if (IS_ENABLED(CONFIG_X86_32))
		return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
@@ -181,29 +167,21 @@ static inline int copy_user_to_fxregs(struct fxregs_state __user *fx)
		return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
}

-static inline void copy_kernel_to_fregs(struct fregs_state *fx)
+static inline void frstor(struct fregs_state *fx)
{
	kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
}

-static inline int copy_kernel_to_fregs_err(struct fregs_state *fx)
+static inline int frstor_safe(struct fregs_state *fx)
{
	return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
}

-static inline int copy_user_to_fregs(struct fregs_state __user *fx)
+static inline int frstor_from_user_sigframe(struct fregs_state __user *fx)
{
	return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
}

-static inline void copy_fxregs_to_kernel(struct fpu *fpu)
-{
-	if (IS_ENABLED(CONFIG_X86_32))
-		asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state.fxsave));
-	else
-		asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave));
-}

static inline void fxsave(struct fxregs_state *fx)
{
	if (IS_ENABLED(CONFIG_X86_32))
@@ -219,16 +197,20 @@ static inline void fxsave(struct fxregs_state *fx)
#define XRSTOR		".byte " REX_PREFIX "0x0f,0xae,0x2f"
#define XRSTORS		".byte " REX_PREFIX "0x0f,0xc7,0x1f"

+/*
+ * After this @err contains 0 on success or the negated trap number when
+ * the operation raises an exception. For faults this results in -EFAULT.
+ */
#define XSTATE_OP(op, st, lmask, hmask, err)				\
	asm volatile("1:" op "\n\t"					\
		     "xor %[err], %[err]\n"				\
		     "2:\n\t"						\
		     ".pushsection .fixup,\"ax\"\n\t"			\
		     "3: movl $-2,%[err]\n\t"				\
		     "3: negl %%eax\n\t"				\
		     "jmp 2b\n\t"					\
		     ".popsection\n\t"					\
-		     _ASM_EXTABLE(1b, 3b)				\
-		     : [err] "=r" (err)					\
+		     _ASM_EXTABLE_FAULT(1b, 3b)				\
+		     : [err] "=a" (err)					\
		     : "D" (st), "m" (*st), "a" (lmask), "d" (hmask)	\
		     : "memory")

@@ -280,9 +262,9 @@ static inline void fxsave(struct fxregs_state *fx)
 * This function is called only during boot time when x86 caps are not set
 * up and alternative can not be used yet.
 */
-static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate)
+static inline void os_xrstor_booting(struct xregs_state *xstate)
{
-	u64 mask = -1;
+	u64 mask = xfeatures_mask_fpstate();
	u32 lmask = mask;
	u32 hmask = mask >> 32;
	int err;
@@ -303,8 +285,11 @@ static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate)

/*
 * Save processor xstate to xsave area.
+ *
+ * Uses either XSAVE or XSAVEOPT or XSAVES depending on the CPU features
+ * and command line options. The choice is permanent until the next reboot.
 */
-static inline void copy_xregs_to_kernel(struct xregs_state *xstate)
+static inline void os_xsave(struct xregs_state *xstate)
{
	u64 mask = xfeatures_mask_all;
	u32 lmask = mask;
@@ -321,8 +306,10 @@ static inline void copy_xregs_to_kernel(struct xregs_state *xstate)

/*
 * Restore processor xstate from xsave area.
+ *
+ * Uses XRSTORS when XSAVES is used, XRSTOR otherwise.
 */
-static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask)
+static inline void os_xrstor(struct xregs_state *xstate, u64 mask)
{
	u32 lmask = mask;
	u32 hmask = mask >> 32;
@@ -340,9 +327,14 @@ static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask)
 * backward compatibility for old applications which don't understand
 * compacted format of xsave area.
 */
-static inline int copy_xregs_to_user(struct xregs_state __user *buf)
+static inline int xsave_to_user_sigframe(struct xregs_state __user *buf)
{
-	u64 mask = xfeatures_mask_user();
+	/*
+	 * Include the features which are not xsaved/rstored by the kernel
+	 * internally, e.g. PKRU. That's user space ABI and also required
+	 * to allow the signal handler to modify PKRU.
+	 */
+	u64 mask = xfeatures_mask_uabi();
	u32 lmask = mask;
	u32 hmask = mask >> 32;
	int err;
@@ -365,7 +357,7 @@ static inline int copy_xregs_to_user(struct xregs_state __user *buf)
/*
 * Restore xstate from user space xsave area.
 */
-static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask)
+static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 mask)
{
	struct xregs_state *xstate = ((__force struct xregs_state *)buf);
	u32 lmask = mask;
@@ -383,13 +375,13 @@ static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask)
 * Restore xstate from kernel space xsave area, return an error code instead of
 * an exception.
 */
-static inline int copy_kernel_to_xregs_err(struct xregs_state *xstate, u64 mask)
+static inline int os_xrstor_safe(struct xregs_state *xstate, u64 mask)
{
	u32 lmask = mask;
	u32 hmask = mask >> 32;
	int err;

-	if (static_cpu_has(X86_FEATURE_XSAVES))
+	if (cpu_feature_enabled(X86_FEATURE_XSAVES))
		XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
	else
		XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
@@ -397,36 +389,11 @@ static inline int copy_kernel_to_xregs_err(struct xregs_state *xstate, u64 mask)
	return err;
}

-extern int copy_fpregs_to_fpstate(struct fpu *fpu);

-static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask)
-{
-	if (use_xsave()) {
-		copy_kernel_to_xregs(&fpstate->xsave, mask);
-	} else {
-		if (use_fxsr())
-			copy_kernel_to_fxregs(&fpstate->fxsave);
-		else
-			copy_kernel_to_fregs(&fpstate->fsave);
-	}
-}
+extern void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask);

-static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate)
+static inline void restore_fpregs_from_fpstate(union fpregs_state *fpstate)
{
-	/*
-	 * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is
-	 * pending. Clear the x87 state here by setting it to fixed values.
-	 * "m" is a random variable that should be in L1.
-	 */
-	if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) {
-		asm volatile(
-			"fnclex\n\t"
-			"emms\n\t"
-			"fildl %P[addr]"	/* set F?P to defined value */
-			: : [addr] "m" (fpstate));
-	}
-
-	__copy_kernel_to_fpregs(fpstate, -1);
+	__restore_fpregs_from_fpstate(fpstate, xfeatures_mask_fpstate());
}

extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size);
@@ -485,10 +452,8 @@ static inline void fpregs_activate(struct fpu *fpu)
	trace_x86_fpu_regs_activated(fpu);
}

-/*
- * Internal helper, do not use directly. Use switch_fpu_return() instead.
- */
-static inline void __fpregs_load_activate(void)
+/* Internal helper for switch_fpu_return() and signal frame setup */
+static inline void fpregs_restore_userregs(void)
{
	struct fpu *fpu = &current->thread.fpu;
	int cpu = smp_processor_id();
@@ -497,7 +462,21 @@ static inline void __fpregs_load_activate(void)
		return;

	if (!fpregs_state_valid(fpu, cpu)) {
-		copy_kernel_to_fpregs(&fpu->state);
+		u64 mask;
+
+		/*
+		 * This restores _all_ xstate which has not been
+		 * established yet.
+		 *
+		 * If PKRU is enabled, then the PKRU value is already
+		 * correct because it was either set in switch_to() or in
+		 * flush_thread(). So it is excluded because it might be
+		 * not up to date in current->thread.fpu.xsave state.
+		 */
+		mask = xfeatures_mask_restore_user() |
+			xfeatures_mask_supervisor();
+		__restore_fpregs_from_fpstate(&fpu->state, mask);
+
		fpregs_activate(fpu);
		fpu->last_cpu = cpu;
	}
@@ -529,12 +508,17 @@ static inline void __fpregs_load_activate(void)
static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu)
{
	if (static_cpu_has(X86_FEATURE_FPU) && !(current->flags & PF_KTHREAD)) {
-		if (!copy_fpregs_to_fpstate(old_fpu))
-			old_fpu->last_cpu = -1;
-		else
+		save_fpregs_to_fpstate(old_fpu);
+		/*
+		 * The save operation preserved register state, so the
+		 * fpu_fpregs_owner_ctx is still @old_fpu. Store the
+		 * current CPU number in @old_fpu, so the next return
+		 * to user space can avoid the FPU register restore
+		 * when it returns on the same CPU and still owns the
+		 * context.
+		 */
		old_fpu->last_cpu = cpu;

		/* But leave fpu_fpregs_owner_ctx! */
		trace_x86_fpu_regs_deactivated(old_fpu);
	}
}
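
The "still owns the context" condition referenced in the comment above is
the ownership test used on the restore side by fpregs_restore_userregs().
The helper is defined elsewhere in this header and is not part of this
hunk; it is reproduced here as a sketch for orientation:

   static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu)
   {
   	/* Registers are current for @fpu only if this CPU still owns the
   	 * context and @fpu was last loaded on this CPU. */
   	return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu;
   }

Only when both conditions hold can the return to user space skip the
FPU register restore.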
@@ -544,39 +528,13 @@ static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu)
 */

/*
- * Load PKRU from the FPU context if available. Delay loading of the
- * complete FPU state until the return to userland.
+ * Delay loading of the complete FPU state until the return to userland.
+ * PKRU is handled separately.
 */
static inline void switch_fpu_finish(struct fpu *new_fpu)
{
-	u32 pkru_val = init_pkru_value;
-	struct pkru_state *pk;
-
-	if (!static_cpu_has(X86_FEATURE_FPU))
-		return;

+	if (cpu_feature_enabled(X86_FEATURE_FPU))
		set_thread_flag(TIF_NEED_FPU_LOAD);

-	if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
-		return;
-
-	/*
-	 * PKRU state is switched eagerly because it needs to be valid before we
-	 * return to userland e.g. for a copy_to_user() operation.
-	 */
-	if (!(current->flags & PF_KTHREAD)) {
-		/*
-		 * If the PKRU bit in xsave.header.xfeatures is not set,
-		 * then the PKRU component was in init state, which means
-		 * XRSTOR will set PKRU to 0. If the bit is not set then
-		 * get_xsave_addr() will return NULL because the PKRU value
-		 * in memory is not valid. This means pkru_val has to be
-		 * set to 0 and not to init_pkru_value.
-		 */
-		pk = get_xsave_addr(&new_fpu->state.xsave, XFEATURE_PKRU);
-		pkru_val = pk ? pk->pkru : 0;
-	}
-	__write_pkru(pkru_val);
}

#endif /* _ASM_X86_FPU_INTERNAL_H */