Commit 8c69e7af authored by Linus Torvalds
Browse files

Merge tag 'x86_alternatives_for_v6.5' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 instruction alternatives updates from Borislav Petkov:

 - Up until now the Fast Short Rep Mov optimizations implied the
   presence of the ERMS CPUID flag. AMD decoupled them with a BIOS
   setting so decouple that dependency in the kernel code too

 - Teach the alternatives machinery to handle relocations

 - Make debug_alternative accept flags in order to see only the set of
   patching one is interested in

 - Other fixes, cleanups and optimizations to the patching code

* tag 'x86_alternatives_for_v6.5' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/alternative: PAUSE is not a NOP
  x86/alternatives: Add cond_resched() to text_poke_bp_batch()
  x86/nospec: Shorten RESET_CALL_DEPTH
  x86/alternatives: Add longer 64-bit NOPs
  x86/alternatives: Fix section mismatch warnings
  x86/alternative: Optimize returns patching
  x86/alternative: Complicate optimize_nops() some more
  x86/alternative: Rewrite optimize_nops() some
  x86/lib/memmove: Decouple ERMS from FSRM
  x86/alternative: Support relocations in alternatives
  x86/alternative: Make debug-alternative selective
parents aa35a483 2bd4aa93
Loading
Loading
Loading
Loading
+14 −2
Original line number Diff line number Diff line
@@ -34,6 +34,8 @@
#define BYTES_NOP7	0x8d,0xb4,0x26,0x00,0x00,0x00,0x00
#define BYTES_NOP8	0x3e,BYTES_NOP7

#define ASM_NOP_MAX 8

#else

/*
@@ -47,6 +49,9 @@
 * 6: osp nopl 0x00(%eax,%eax,1)
 * 7: nopl 0x00000000(%eax)
 * 8: nopl 0x00000000(%eax,%eax,1)
 * 9: cs nopl 0x00000000(%eax,%eax,1)
 * 10: osp cs nopl 0x00000000(%eax,%eax,1)
 * 11: osp osp cs nopl 0x00000000(%eax,%eax,1)
 */
#define BYTES_NOP1	0x90
#define BYTES_NOP2	0x66,BYTES_NOP1
@@ -56,6 +61,15 @@
#define BYTES_NOP6	0x66,BYTES_NOP5
#define BYTES_NOP7	0x0f,0x1f,0x80,0x00,0x00,0x00,0x00
#define BYTES_NOP8	0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
#define BYTES_NOP9	0x2e,BYTES_NOP8
#define BYTES_NOP10	0x66,BYTES_NOP9
#define BYTES_NOP11	0x66,BYTES_NOP10

#define ASM_NOP9  _ASM_BYTES(BYTES_NOP9)
#define ASM_NOP10 _ASM_BYTES(BYTES_NOP10)
#define ASM_NOP11 _ASM_BYTES(BYTES_NOP11)

#define ASM_NOP_MAX 11

#endif /* CONFIG_64BIT */

@@ -68,8 +82,6 @@
#define ASM_NOP7 _ASM_BYTES(BYTES_NOP7)
#define ASM_NOP8 _ASM_BYTES(BYTES_NOP8)

#define ASM_NOP_MAX 8

#ifndef __ASSEMBLY__
extern const unsigned char * const x86_nops[];
#endif
+3 −3
Original line number Diff line number Diff line
@@ -84,12 +84,12 @@
	movq	$-1, PER_CPU_VAR(pcpu_hot + X86_call_depth);

#define RESET_CALL_DEPTH					\
	mov	$0x80, %rax;					\
	shl	$56, %rax;					\
	xor	%eax, %eax;					\
	bts	$63, %rax;					\
	movq	%rax, PER_CPU_VAR(pcpu_hot + X86_call_depth);

#define RESET_CALL_DEPTH_FROM_CALL				\
	mov	$0xfc, %rax;					\
	movb	$0xfc, %al;					\
	shl	$56, %rax;					\
	movq	%rax, PER_CPU_VAR(pcpu_hot + X86_call_depth);	\
	CALL_THUNKS_DEBUG_INC_CALLS
+324 −146
Original line number Diff line number Diff line
@@ -37,11 +37,23 @@ EXPORT_SYMBOL_GPL(alternatives_patched);

#define MAX_PATCH_LEN (255-1)

static int __initdata_or_module debug_alternative;
#define DA_ALL		(~0)
#define DA_ALT		0x01
#define DA_RET		0x02
#define DA_RETPOLINE	0x04
#define DA_ENDBR	0x08
#define DA_SMP		0x10

static unsigned int __initdata_or_module debug_alternative;

static int __init debug_alt(char *str)
{
	debug_alternative = 1;
	if (str && *str == '=')
		str++;

	if (!str || kstrtouint(str, 0, &debug_alternative))
		debug_alternative = DA_ALL;

	return 1;
}
__setup("debug-alternative", debug_alt);
@@ -55,15 +67,15 @@ static int __init setup_noreplace_smp(char *str)
}
__setup("noreplace-smp", setup_noreplace_smp);

#define DPRINTK(fmt, args...)						\
#define DPRINTK(type, fmt, args...)					\
do {									\
	if (debug_alternative)						\
	if (debug_alternative & DA_##type)				\
		printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args);		\
} while (0)

#define DUMP_BYTES(buf, len, fmt, args...)				\
#define DUMP_BYTES(type, buf, len, fmt, args...)			\
do {									\
	if (unlikely(debug_alternative)) {				\
	if (unlikely(debug_alternative & DA_##type)) {			\
		int j;							\
									\
		if (!(len))						\
@@ -86,6 +98,11 @@ static const unsigned char x86nops[] =
	BYTES_NOP6,
	BYTES_NOP7,
	BYTES_NOP8,
#ifdef CONFIG_64BIT
	BYTES_NOP9,
	BYTES_NOP10,
	BYTES_NOP11,
#endif
};

const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
@@ -99,19 +116,44 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
	x86nops + 1 + 2 + 3 + 4 + 5,
	x86nops + 1 + 2 + 3 + 4 + 5 + 6,
	x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
#ifdef CONFIG_64BIT
	x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
	x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9,
	x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10,
#endif
};

/* Use this to add nops to a buffer, then text_poke the whole buffer. */
static void __init_or_module add_nops(void *insns, unsigned int len)
/*
 * Fill the buffer with a single effective instruction of size @len.
 *
 * In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info)
 * for every single-byte NOP, try to generate the maximally available NOP of
 * size <= ASM_NOP_MAX such that only a single CFI entry is generated (vs one for
 * each single-byte NOP). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and
 * *jump* over instead of executing long and daft NOPs.
 */
static void __init_or_module add_nop(u8 *instr, unsigned int len)
{
	while (len > 0) {
		unsigned int noplen = len;
		if (noplen > ASM_NOP_MAX)
			noplen = ASM_NOP_MAX;
		memcpy(insns, x86_nops[noplen], noplen);
		insns += noplen;
		len -= noplen;
	u8 *target = instr + len;

	if (!len)
		return;

	if (len <= ASM_NOP_MAX) {
		memcpy(instr, x86_nops[len], len);
		return;
	}

	if (len < 128) {
		__text_gen_insn(instr, JMP8_INSN_OPCODE, instr, target, JMP8_INSN_SIZE);
		instr += JMP8_INSN_SIZE;
	} else {
		__text_gen_insn(instr, JMP32_INSN_OPCODE, instr, target, JMP32_INSN_SIZE);
		instr += JMP32_INSN_SIZE;
	}

	for (;instr < target; instr++)
		*instr = INT3_INSN_OPCODE;
}

extern s32 __retpoline_sites[], __retpoline_sites_end[];
@@ -123,133 +165,223 @@ extern s32 __smp_locks[], __smp_locks_end[];
void text_poke_early(void *addr, const void *opcode, size_t len);

/*
 * Are we looking at a near JMP with a 1 or 4-byte displacement.
 * Matches NOP and NOPL, not any of the other possible NOPs.
 */
static inline bool is_jmp(const u8 opcode)
static bool insn_is_nop(struct insn *insn)
{
	return opcode == 0xeb || opcode == 0xe9;
}
	/* Anything NOP, but no REP NOP */
	if (insn->opcode.bytes[0] == 0x90 &&
	    (!insn->prefixes.nbytes || insn->prefixes.bytes[0] != 0xF3))
		return true;

static void __init_or_module
recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insn_buff)
{
	u8 *next_rip, *tgt_rip;
	s32 n_dspl, o_dspl;
	int repl_len;
	/* NOPL */
	if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F)
		return true;

	if (a->replacementlen != 5)
		return;
	/* TODO: more nops */

	o_dspl = *(s32 *)(insn_buff + 1);
	return false;
}

	/* next_rip of the replacement JMP */
	next_rip = repl_insn + a->replacementlen;
	/* target rip of the replacement JMP */
	tgt_rip  = next_rip + o_dspl;
	n_dspl = tgt_rip - orig_insn;
/*
 * Find the offset of the first non-NOP instruction starting at @offset
 * but no further than @len.
 */
static int skip_nops(u8 *instr, int offset, int len)
{
	struct insn insn;

	DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);
	for (; offset < len; offset += insn.length) {
		if (insn_decode_kernel(&insn, &instr[offset]))
			break;

	if (tgt_rip - orig_insn >= 0) {
		if (n_dspl - 2 <= 127)
			goto two_byte_jmp;
		else
			goto five_byte_jmp;
	/* negative offset */
	} else {
		if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
			goto two_byte_jmp;
		else
			goto five_byte_jmp;
		if (!insn_is_nop(&insn))
			break;
	}

two_byte_jmp:
	n_dspl -= 2;

	insn_buff[0] = 0xeb;
	insn_buff[1] = (s8)n_dspl;
	add_nops(insn_buff + 2, 3);
	return offset;
}

	repl_len = 2;
	goto done;
/*
 * Optimize a sequence of NOPs, possibly preceded by an unconditional jump
 * to the end of the NOP sequence into a single NOP.
 */
static bool __init_or_module
__optimize_nops(u8 *instr, size_t len, struct insn *insn, int *next, int *prev, int *target)
{
	int i = *next - insn->length;

five_byte_jmp:
	n_dspl -= 5;
	switch (insn->opcode.bytes[0]) {
	case JMP8_INSN_OPCODE:
	case JMP32_INSN_OPCODE:
		*prev = i;
		*target = *next + insn->immediate.value;
		return false;
	}

	insn_buff[0] = 0xe9;
	*(s32 *)&insn_buff[1] = n_dspl;
	if (insn_is_nop(insn)) {
		int nop = i;

	repl_len = 5;
		*next = skip_nops(instr, *next, len);
		if (*target && *next == *target)
			nop = *prev;

done:
		add_nop(instr + nop, *next - nop);
		DUMP_BYTES(ALT, instr, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, *next);
		return true;
	}

	DPRINTK("final displ: 0x%08x, JMP 0x%lx",
		n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
	*target = 0;
	return false;
}

/*
 * optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90)
 *
 * @instr: instruction byte stream
 * @instrlen: length of the above
 * @off: offset within @instr where the first NOP has been detected
 *
 * Return: number of NOPs found (and replaced).
 * "noinline" to cause control flow change and thus invalidate I$ and
 * cause refetch after modification.
 */
static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off)
static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
{
	unsigned long flags;
	int i = off, nnops;
	int prev, target = 0;

	while (i < instrlen) {
		if (instr[i] != 0x90)
			break;
	for (int next, i = 0; i < len; i = next) {
		struct insn insn;

		i++;
	}
		if (insn_decode_kernel(&insn, &instr[i]))
			return;

	nnops = i - off;
		next = i + insn.length;

	if (nnops <= 1)
		return nnops;
		__optimize_nops(instr, len, &insn, &next, &prev, &target);
	}
}

	local_irq_save(flags);
	add_nops(instr + off, nnops);
	local_irq_restore(flags);
/*
 * In this context, "source" is where the instructions are placed in the
 * section .altinstr_replacement, for example during kernel build by the
 * toolchain.
 * "Destination" is where the instructions are being patched in by this
 * machinery.
 *
 * The source offset is:
 *
 *   src_imm = target - src_next_ip                  (1)
 *
 * and the target offset is:
 *
 *   dst_imm = target - dst_next_ip                  (2)
 *
 * so rework (1) as an expression for target like:
 *
 *   target = src_imm + src_next_ip                  (1a)
 *
 * and substitute in (2) to get:
 *
 *   dst_imm = (src_imm + src_next_ip) - dst_next_ip (3)
 *
 * Now, since the instruction stream is 'identical' at src and dst (it
 * is being copied after all) it can be stated that:
 *
 *   src_next_ip = src + ip_offset
 *   dst_next_ip = dst + ip_offset                   (4)
 *
 * Substitute (4) in (3) and observe ip_offset being cancelled out to
 * obtain:
 *
 *   dst_imm = src_imm + (src + ip_offset) - (dst + ip_offset)
 *           = src_imm + src - dst + ip_offset - ip_offset
 *           = src_imm + src - dst                   (5)
 *
 * IOW, only the relative displacement of the code block matters.
 */

/*
 * apply_reloc_n() - add @d_ to the signed @n_-bit value stored at @p_.
 *
 * The value is loaded through an s<n_> pointer into an s32, adjusted by
 * @d_ and stored back truncated to @n_ bits.
 *
 * BUG()s when the adjusted value no longer fits in @n_ bits: v >> 31
 * is the sign of the full result and v >> (n_-1) only equals it when
 * all bits from n_-1 upwards are copies of that sign bit. For @n_ == 32
 * both shifts are the same, so the check can never fire.
 *
 * Note that @p_ is evaluated twice.
 */
#define apply_reloc_n(n_, p_, d_)				\
	do {							\
		s32 v = *(s##n_ *)(p_);				\
		v += (d_);					\
		BUG_ON((v >> 31) != (v >> (n_-1)));		\
		*(s##n_ *)(p_) = (s##n_)v;			\
	} while (0)

	DUMP_BYTES(instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i);

	return nnops;
/*
 * Adjust the signed relative value of @n bytes stored at @ptr by @diff.
 *
 * Only 1, 2 and 4 byte wide fields are supported; any other width is a
 * BUG(). The per-width work, including the range check on the result,
 * is done by apply_reloc_n().
 */
static __always_inline
void apply_reloc(int n, void *ptr, uintptr_t diff)
{
	if (n == 1)
		apply_reloc_n(8, ptr, diff);
	else if (n == 2)
		apply_reloc_n(16, ptr, diff);
	else if (n == 4)
		apply_reloc_n(32, ptr, diff);
	else
		BUG();
}

static __always_inline
bool need_reloc(unsigned long offset, u8 *src, size_t src_len)
{
	u8 *target = src + offset;
	/*
 * "noinline" to cause control flow change and thus invalidate I$ and
 * cause refetch after modification.
	 * If the target is inside the patched block, it's relative to the
	 * block itself and does not need relocation.
	 */
static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
	return (target < src || target > src + src_len);
}

static void __init_or_module noinline
apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
{
	int prev, target = 0;

	for (int next, i = 0; i < len; i = next) {
		struct insn insn;
	int i = 0;

	/*
	 * Jump over the non-NOP insns and optimize single-byte NOPs into bigger
	 * ones.
	 */
	for (;;) {
		if (insn_decode_kernel(&insn, &instr[i]))
		if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i])))
			return;

		next = i + insn.length;

		if (__optimize_nops(buf, len, &insn, &next, &prev, &target))
			continue;

		switch (insn.opcode.bytes[0]) {
		case 0x0f:
			if (insn.opcode.bytes[1] < 0x80 ||
			    insn.opcode.bytes[1] > 0x8f)
				break;

			fallthrough;	/* Jcc.d32 */
		case 0x70 ... 0x7f:	/* Jcc.d8 */
		case JMP8_INSN_OPCODE:
		case JMP32_INSN_OPCODE:
		case CALL_INSN_OPCODE:
			if (need_reloc(next + insn.immediate.value, src, src_len)) {
				apply_reloc(insn.immediate.nbytes,
					    buf + i + insn_offset_immediate(&insn),
					    src - dest);
			}

			/*
		 * See if this and any potentially following NOPs can be
		 * optimized.
			 * Where possible, convert JMP.d32 into JMP.d8.
			 */
		if (insn.length == 1 && insn.opcode.bytes[0] == 0x90)
			i += optimize_nops_range(instr, len, i);
		else
			i += insn.length;
			if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) {
				s32 imm = insn.immediate.value;
				imm += src - dest;
				imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE;
				if ((imm >> 31) == (imm >> 7)) {
					buf[i+0] = JMP8_INSN_OPCODE;
					buf[i+1] = (s8)imm;

		if (i >= len)
			return;
					memset(&buf[i+2], INT3_INSN_OPCODE, insn.length - 2);
				}
			}
			break;
		}

		if (insn_rip_relative(&insn)) {
			if (need_reloc(next + insn.displacement.value, src, src_len)) {
				apply_reloc(insn.displacement.nbytes,
					    buf + i + insn_offset_displacement(&insn),
					    src - dest);
			}
		}
	}
}

@@ -270,7 +402,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
	u8 *instr, *replacement;
	u8 insn_buff[MAX_PATCH_LEN];

	DPRINTK("alt table %px, -> %px", start, end);
	DPRINTK(ALT, "alt table %px, -> %px", start, end);
	/*
	 * The scan order should be from start to end. A later scanned
	 * alternative code can overwrite previously scanned alternative code.
@@ -294,47 +426,31 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
		 * - feature not present but ALT_FLAG_NOT is set to mean,
		 *   patch if feature is *NOT* present.
		 */
		if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT))
			goto next;
		if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) {
			optimize_nops(instr, a->instrlen);
			continue;
		}

		DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)",
		DPRINTK(ALT, "feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)",
			(a->flags & ALT_FLAG_NOT) ? "!" : "",
			a->cpuid >> 5,
			a->cpuid & 0x1f,
			instr, instr, a->instrlen,
			replacement, a->replacementlen);

		DUMP_BYTES(instr, a->instrlen, "%px:   old_insn: ", instr);
		DUMP_BYTES(replacement, a->replacementlen, "%px:   rpl_insn: ", replacement);

		memcpy(insn_buff, replacement, a->replacementlen);
		insn_buff_sz = a->replacementlen;

		/*
		 * 0xe8 is a relative CALL; fix the offset.
		 *
		 * Instruction length is checked before the opcode to avoid
		 * accessing uninitialized bytes for zero-length replacements.
		 */
		if (a->replacementlen == 5 && *insn_buff == 0xe8) {
			*(s32 *)(insn_buff + 1) += replacement - instr;
			DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
				*(s32 *)(insn_buff + 1),
				(unsigned long)instr + *(s32 *)(insn_buff + 1) + 5);
		}

		if (a->replacementlen && is_jmp(replacement[0]))
			recompute_jump(a, instr, replacement, insn_buff);

		for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
			insn_buff[insn_buff_sz] = 0x90;

		DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
		apply_relocation(insn_buff, a->instrlen, instr, replacement, a->replacementlen);

		text_poke_early(instr, insn_buff, insn_buff_sz);
		DUMP_BYTES(ALT, instr, a->instrlen, "%px:   old_insn: ", instr);
		DUMP_BYTES(ALT, replacement, a->replacementlen, "%px:   rpl_insn: ", replacement);
		DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr);

next:
		optimize_nops(instr, a->instrlen);
		text_poke_early(instr, insn_buff, insn_buff_sz);
	}
}

@@ -555,15 +671,15 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
			continue;
		}

		DPRINTK("retpoline at: %pS (%px) len: %d to: %pS",
		DPRINTK(RETPOLINE, "retpoline at: %pS (%px) len: %d to: %pS",
			addr, addr, insn.length,
			addr + insn.length + insn.immediate.value);

		len = patch_retpoline(addr, &insn, bytes);
		if (len == insn.length) {
			optimize_nops(bytes, len);
			DUMP_BYTES(((u8*)addr),  len, "%px: orig: ", addr);
			DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr);
			DUMP_BYTES(RETPOLINE, ((u8*)addr),  len, "%px: orig: ", addr);
			DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr);
			text_poke_early(addr, bytes, len);
		}
	}
@@ -590,13 +706,12 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes)
{
	int i = 0;

	/* Patch the custom return thunks... */
	if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) {
		if (x86_return_thunk == __x86_return_thunk)
			return -1;

		i = JMP32_INSN_SIZE;
		__text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i);
	} else {
		/* ... or patch them out if not needed. */
		bytes[i++] = RET_INSN_OPCODE;
	}

@@ -609,6 +724,14 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end)
{
	s32 *s;

	/*
	 * Do not patch out the default return thunks if those needed are the
	 * ones generated by the compiler.
	 */
	if (cpu_feature_enabled(X86_FEATURE_RETHUNK) &&
	    (x86_return_thunk == __x86_return_thunk))
		return;

	for (s = start; s < end; s++) {
		void *dest = NULL, *addr = (void *)s + *s;
		struct insn insn;
@@ -630,14 +753,14 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end)
			      addr, dest, 5, addr))
			continue;

		DPRINTK("return thunk at: %pS (%px) len: %d to: %pS",
		DPRINTK(RET, "return thunk at: %pS (%px) len: %d to: %pS",
			addr, addr, insn.length,
			addr + insn.length + insn.immediate.value);

		len = patch_return(addr, &insn, bytes);
		if (len == insn.length) {
			DUMP_BYTES(((u8*)addr),  len, "%px: orig: ", addr);
			DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr);
			DUMP_BYTES(RET, ((u8*)addr),  len, "%px: orig: ", addr);
			DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr);
			text_poke_early(addr, bytes, len);
		}
	}
@@ -655,7 +778,7 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }

#ifdef CONFIG_X86_KERNEL_IBT

static void poison_endbr(void *addr, bool warn)
static void __init_or_module poison_endbr(void *addr, bool warn)
{
	u32 endbr, poison = gen_endbr_poison();

@@ -667,13 +790,13 @@ static void poison_endbr(void *addr, bool warn)
		return;
	}

	DPRINTK("ENDBR at: %pS (%px)", addr, addr);
	DPRINTK(ENDBR, "ENDBR at: %pS (%px)", addr, addr);

	/*
	 * When we have IBT, the lack of ENDBR will trigger #CP
	 */
	DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr);
	DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr);
	DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr);
	DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr);
	text_poke_early(addr, &poison, 4);
}

@@ -1148,7 +1271,7 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
	smp->locks_end	= locks_end;
	smp->text	= text;
	smp->text_end	= text_end;
	DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
	DPRINTK(SMP, "locks %p -> %p, text %p -> %p, name %s\n",
		smp->locks, smp->locks_end,
		smp->text, smp->text_end, smp->name);

@@ -1225,6 +1348,20 @@ int alternatives_text_reserved(void *start, void *end)
#endif /* CONFIG_SMP */

#ifdef CONFIG_PARAVIRT

/*
 * Fill @insns with @len bytes worth of NOP instructions.
 *
 * The buffer is filled front to back, each step emitting the longest
 * NOP that still fits, capped at ASM_NOP_MAX bytes. The result is meant
 * to be text_poke()d as a whole by the caller.
 */
static void __init_or_module add_nops(void *insns, unsigned int len)
{
	unsigned char *dst = insns;
	unsigned int chunk;

	for (; len; len -= chunk, dst += chunk) {
		chunk = len > ASM_NOP_MAX ? ASM_NOP_MAX : len;
		memcpy(dst, x86_nops[chunk], chunk);
	}
}

void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
				     struct paravirt_patch_site *end)
{
@@ -1332,6 +1469,35 @@ static noinline void __init int3_selftest(void)
	unregister_die_notifier(&int3_exception_nb);
}

/*
 * Dummy object for the relocation self-test: only its address matters,
 * it is the argument __alt_reloc_selftest() expects to receive.
 */
static __initdata int __alt_reloc_selftest_addr;

/*
 * Call target of the relocation self-test. It is referenced by symbol
 * name from the inline asm in alt_reloc_selftest(), and WARNs unless
 * @arg is the address of __alt_reloc_selftest_addr.
 */
__visible noinline void __init __alt_reloc_selftest(void *arg)
{
	WARN_ON(arg != &__alt_reloc_selftest_addr);
}

/*
 * Emit an alternative whose original code is empty and whose replacement
 * (keyed on X86_FEATURE_ALWAYS) loads the address of
 * __alt_reloc_selftest_addr into the _ASM_ARG1 register and calls
 * __alt_reloc_selftest() with it.
 */
static noinline void __init alt_reloc_selftest(void)
{
	/*
	 * Tests apply_relocation().
	 *
	 * This has a relative immediate (CALL) in a place other than the first
	 * instruction and additionally on x86_64 we get a RIP-relative LEA:
	 *
	 *   lea    0x0(%rip),%rdi  # 5d0: R_X86_64_PC32    .init.data+0x5566c
	 *   call   +0              # 5d5: R_X86_64_PLT32   __alt_reloc_selftest-0x4
	 *
	 * Getting this wrong will either crash and burn or tickle the WARN
	 * above.
	 */
	asm_inline volatile (
		ALTERNATIVE("", "lea %[mem], %%" _ASM_ARG1 "; call __alt_reloc_selftest;", X86_FEATURE_ALWAYS)
		: /* output */
		/* memory operand, so the LEA yields the variable's address */
		: [mem] "m" (__alt_reloc_selftest_addr)
		/* the LEA overwrites this register */
		: _ASM_ARG1
	);
}

void __init alternative_instructions(void)
{
	int3_selftest();
@@ -1419,6 +1585,8 @@ void __init alternative_instructions(void)

	restart_nmi();
	alternatives_patched = 1;

	alt_reloc_selftest();
}

/**
@@ -1953,6 +2121,16 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
	 */
	atomic_set_release(&bp_desc.refs, 1);

	/*
	 * Function tracing can enable thousands of places that need to be
	 * updated. This can take quite some time, and with full kernel debugging
	 * enabled, this could cause the softlockup watchdog to trigger.
	 * This function gets called every 256 entries added to be patched.
	 * Call cond_resched() here to make sure that other tasks can get scheduled
	 * while processing all the functions being patched.
	 */
	cond_resched();

	/*
	 * Corresponding read barrier in int3 notifier for making sure the
	 * nr_entries and handler are correctly ordered wrt. patching.
+5 −8
Original line number Diff line number Diff line
@@ -38,10 +38,12 @@ SYM_FUNC_START(__memmove)
	cmp %rdi, %r8
	jg 2f

	/* FSRM implies ERMS => no length checks, do the copy directly */
#define CHECK_LEN	cmp $0x20, %rdx; jb 1f
#define MEMMOVE_BYTES	movq %rdx, %rcx; rep movsb; RET
.Lmemmove_begin_forward:
	ALTERNATIVE "cmp $0x20, %rdx; jb 1f", "", X86_FEATURE_FSRM
	ALTERNATIVE "", "jmp .Lmemmove_erms", X86_FEATURE_ERMS
	ALTERNATIVE_2 __stringify(CHECK_LEN), \
		      __stringify(CHECK_LEN; MEMMOVE_BYTES), X86_FEATURE_ERMS, \
		      __stringify(MEMMOVE_BYTES), X86_FEATURE_FSRM

	/*
	 * The movsq instruction has a high startup latency
@@ -207,11 +209,6 @@ SYM_FUNC_START(__memmove)
	movb %r11b, (%rdi)
13:
	RET

.Lmemmove_erms:
	movq %rdx, %rcx
	rep movsb
	RET
SYM_FUNC_END(__memmove)
EXPORT_SYMBOL(__memmove)

+14 −2
Original line number Diff line number Diff line
@@ -34,6 +34,8 @@
#define BYTES_NOP7	0x8d,0xb4,0x26,0x00,0x00,0x00,0x00
#define BYTES_NOP8	0x3e,BYTES_NOP7

#define ASM_NOP_MAX 8

#else

/*
@@ -47,6 +49,9 @@
 * 6: osp nopl 0x00(%eax,%eax,1)
 * 7: nopl 0x00000000(%eax)
 * 8: nopl 0x00000000(%eax,%eax,1)
 * 9: cs nopl 0x00000000(%eax,%eax,1)
 * 10: osp cs nopl 0x00000000(%eax,%eax,1)
 * 11: osp osp cs nopl 0x00000000(%eax,%eax,1)
 */
#define BYTES_NOP1	0x90
#define BYTES_NOP2	0x66,BYTES_NOP1
@@ -56,6 +61,15 @@
#define BYTES_NOP6	0x66,BYTES_NOP5
#define BYTES_NOP7	0x0f,0x1f,0x80,0x00,0x00,0x00,0x00
#define BYTES_NOP8	0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
#define BYTES_NOP9	0x2e,BYTES_NOP8
#define BYTES_NOP10	0x66,BYTES_NOP9
#define BYTES_NOP11	0x66,BYTES_NOP10

#define ASM_NOP9  _ASM_BYTES(BYTES_NOP9)
#define ASM_NOP10 _ASM_BYTES(BYTES_NOP10)
#define ASM_NOP11 _ASM_BYTES(BYTES_NOP11)

#define ASM_NOP_MAX 11

#endif /* CONFIG_64BIT */

@@ -68,8 +82,6 @@
#define ASM_NOP7 _ASM_BYTES(BYTES_NOP7)
#define ASM_NOP8 _ASM_BYTES(BYTES_NOP8)

#define ASM_NOP_MAX 8

#ifndef __ASSEMBLY__
extern const unsigned char * const x86_nops[];
#endif
Loading