Commit 0cd6fd52 authored by QIU ZHITENG
Browse files

mm: Enhanced copy capabilities for Hygon processor

hygon inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IBE9G7


CVE: NA

---------------------------

The following methods are used to improve the large memory copy
performance of the Hygon processor between kernel and user mode.

Prefetch is a technique for reading blocks of data from main memory at
very high data rates, then operating on them within the cache. Results
are then written out to memory, all with high efficiency.

The code can employ a very special instruction: NT. This is a streaming
store instruction for writing data to memory. This instruction bypasses
the on-chip cache and sends data directly into a write-combining buffer.
Because NT allows the CPU to avoid reading the old data from the memory
destination address, NT can effectively improve the total write bandwidth.
There are similar optimizations for reading data from memory.

A large memory copy may be interrupted, and the interruption may trigger
a thread switch. The current MMX register context therefore has to be
saved, so that the copy can continue the next time the thread is
switched back in.

Signed-off-by: zhuchao <zhuchao@hygon.cn>
Signed-off-by: qiuzhiteng <qiuzhiteng@hygon.cn>
parent f5a62ec8
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -867,6 +867,7 @@ config ACRN_GUEST
endif #HYPERVISOR_GUEST

source "arch/x86/Kconfig.cpu"
source "arch/x86/Kconfig.fpu"

config HPET_TIMER
	def_bool X86_64

arch/x86/Kconfig.fpu

0 → 100644
+62 −0
Original line number Diff line number Diff line
# SPDX-License-Identifier: GPL-2.0

# Top-level switch for the Hygon large memory copy feature. It carries the
# same dependencies as the instruction-set choice it guards, so it cannot be
# enabled on configurations where that choice would be empty.
menuconfig USING_FPU_IN_KERNEL_NONATOMIC
	bool "Hygon large memory copy support"
	depends on X86_64 && CPU_SUP_HYGON
	help
	  This option enables support for optimized large memory copy
	  operations on Hygon processors in the kernel space using SSE2 or
	  AVX2 non-temporal (NT) copy instructions. NT instructions are
	  streaming store instructions that bypass the on-chip cache and send
	  data directly to a write-combining buffer.

	  When this option is enabled, you can choose the specific instruction
	  set to use for large memory copy: SSE2 or AVX2. Using these
	  instruction sets can improve data throughput and reduce the number
	  of cache misses during memory copy operations.

if USING_FPU_IN_KERNEL_NONATOMIC

# Exactly one of the two NT copy implementations below is selected.
choice
	prompt "Instruction set for Hygon large memory copy"
	depends on X86_64 && CPU_SUP_HYGON
	default X86_HYGON_LMC_SSE2_ON
	help
	  Select the type of non-temporal (NT) copy instructions to use for
	  large memory copy operations between kernel and user mode. You can
	  choose between SSE2 or AVX2 instructions based on the processor
	  capabilities and the size of the memory being copied.

	  To use this feature, you also need to configure the data copy size
	  in /sys/c86_features/hygon_c86/nt_cpy_mini_len. Values of 4096 and
	  above are suggested.

config X86_HYGON_LMC_SSE2_ON
	bool "Using sse2 nt copy for large memory copy"
	help
	  When this feature is enabled, the kernel will use the
	  copy_user_sse2_opt_string function for large memory copy operations.

	  SSE2 (Streaming SIMD Extensions 2) instructions support non-temporal
	  (NT) stores that bypass the CPU cache and write data directly to
	  memory. This can improve performance for large memory copies by reducing
	  cache pollution and taking advantage of the write-combining buffer.

	  However, using SSE2 NT copy may require saving and restoring MMX and
	  SSE2 register contexts during thread switching if an interruption occurs.

config X86_HYGON_LMC_AVX2_ON
	bool "Using avx2 nt copy for large memory copy"
	help
	  When this feature is enabled, the kernel will use the
	  copy_user_avx2_pf64_nt_string function for large memory copy operations.

	  AVX2 (Advanced Vector Extensions 2) instructions provide enhanced
	  vector processing capabilities and support for non-temporal (NT) stores,
	  which can significantly improve memory copy performance for large blocks
	  of data. By bypassing the cache and writing data directly to memory,
	  AVX2 NT copy can achieve higher throughput than SSE2 NT copy.

	  Similar to SSE2, using AVX2 NT copy may require saving and restoring
	  AVX2 register contexts if an interruption occurs during large memory
	  copying, to ensure the process continues smoothly after resuming.

endchoice
endif
+3 −0
Original line number Diff line number Diff line
@@ -383,6 +383,9 @@ CONFIG_CPU_SUP_AMD=y
CONFIG_CPU_SUP_HYGON=y
CONFIG_CPU_SUP_CENTAUR=y
CONFIG_CPU_SUP_ZHAOXIN=y
CONFIG_USING_FPU_IN_KERNEL_NONATOMIC=y
# CONFIG_X86_HYGON_LMC_SSE2_ON is not set
CONFIG_X86_HYGON_LMC_AVX2_ON=y
CONFIG_HPET_TIMER=y
CONFIG_HPET_EMULATE_RTC=y
CONFIG_DMI=y
+26 −0
Original line number Diff line number Diff line
@@ -37,6 +37,32 @@ static inline void kernel_fpu_begin(void)
	kernel_fpu_begin_mask(KFPU_387 | KFPU_MXCSR);
}

#if defined(CONFIG_X86_HYGON_LMC_SSE2_ON) || \
	defined(CONFIG_X86_HYGON_LMC_AVX2_ON)
/*
 * Prototypes only; the definitions are outside this header. They are
 * declared solely when one of the Hygon large-memory-copy options
 * (SSE2 or AVX2) is configured.
 * NOTE(review): the meaning of the int returned by
 * kernel_fpu_begin_nonatomic_mask() is not visible here - confirm against
 * its definition before relying on it.
 */
extern int kernel_fpu_begin_nonatomic_mask(unsigned int kfpu_mask);
extern void kernel_fpu_end_nonatomic(void);

/*
 * Convenience entry point for callers that have no need to pick a mask:
 * forwards to kernel_fpu_begin_nonatomic_mask() with KFPU_387 | KFPU_MXCSR
 * and hands its return value back unchanged.
 */
static inline int kernel_fpu_begin_nonatomic(void)
{
	const unsigned int default_mask = KFPU_387 | KFPU_MXCSR;

	return kernel_fpu_begin_nonatomic_mask(default_mask);
}

/*
 * Warn (once) if TIF_USING_FPU_NONATOMIC is set for the current thread
 * at the moment this check runs.
 * NOTE(review): the intent appears to be catching kernel_fpu_end() being
 * called after kernel_fpu_begin_nonatomic() but before
 * kernel_fpu_end_nonatomic(); the call site is not visible from here.
 */
static inline void check_using_kernel_fpu(void)
{
	const int nonatomic_in_use = test_thread_flag(TIF_USING_FPU_NONATOMIC);

	WARN_ON_ONCE(nonatomic_in_use);
}

#else
/* No-op stand-in used when neither Hygon LMC option is configured. */
static inline void check_using_kernel_fpu(void)
{
}

#endif

/*
 * Use fpregs_lock() while editing CPU's FPU registers or fpu->fpstate.
 * A context switch will (and softirq might) save CPU's FPU registers to
+59 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: GPL-2.0 */

/*
 * Kernel FPU state switching for scheduling.
 *
 * This is a two-stage process:
 *
 *  - switch_kernel_fpu_prepare() saves the old kernel fpu state.
 *    This is done within the context of the old process.
 *
 *  - switch_kernel_fpu_finish() restores the new kernel fpu state.
 *
 * The kernel FPU context is only stored/restored for a user task in kernel
 * mode and PF_KTHREAD is used to distinguish between kernel and user threads.
 */
#if defined(CONFIG_X86_HYGON_LMC_SSE2_ON) ||                                   \
	defined(CONFIG_X86_HYGON_LMC_AVX2_ON)
/*
 * Helpers defined outside this header; only their prototypes are visible.
 * NOTE(review): judging by the names and by how they are used below,
 * save_fpregs_to_fpkernelstate() stores the live FPU registers for the
 * given task FPU, and get_fpu_registers_pos() returns the address of a
 * save area inside it - confirm against the definitions.
 */
extern void save_fpregs_to_fpkernelstate(struct fpu *kfpu);
extern unsigned long get_fpu_registers_pos(struct fpu *fpu, unsigned int off);
/*
 * switch_kernel_fpu_prepare - save the outgoing task's kernel FPU state
 * @prev: task being switched out
 * @cpu:  not referenced in this function
 *
 * Nothing is saved unless @prev has TIF_USING_FPU_NONATOMIC set, the CPU
 * has an FPU, and @prev is not a kernel thread (PF_KTHREAD).
 *
 * The flag is read from @prev explicitly rather than from the current
 * thread, so the task whose flag is tested is always the task whose state
 * is saved.
 */
static inline void switch_kernel_fpu_prepare(struct task_struct *prev, int cpu)
{
	if (!test_tsk_thread_flag(prev, TIF_USING_FPU_NONATOMIC))
		return;

	if (static_cpu_has(X86_FEATURE_FPU) && !(prev->flags & PF_KTHREAD))
		save_fpregs_to_fpkernelstate(&prev->thread.fpu);
}

/*
 * Hand the kernel FPU context kept in @kfpu to kernel_fpu_states_restore().
 *
 * The source buffer is whatever get_fpu_registers_pos() returns for
 * MAX_FPU_CTX_SIZE, the same constant is passed as the length, and the
 * first argument of kernel_fpu_states_restore() is NULL.
 */
static inline void fpregs_restore_kernelregs(struct fpu *kfpu)
{
	unsigned long ctx_addr = get_fpu_registers_pos(kfpu, MAX_FPU_CTX_SIZE);

	kernel_fpu_states_restore(NULL, (void *)ctx_addr, MAX_FPU_CTX_SIZE);
}

/*
 * switch_kernel_fpu_finish - reload the incoming task's kernel FPU state
 * @next: task being switched in
 *
 * Kernel threads (PF_KTHREAD) are skipped. For any other task the saved
 * state is loaded immediately when the CPU has an FPU and @next has
 * TIF_USING_FPU_NONATOMIC set.
 *
 * The flag is looked up with test_tsk_thread_flag() instead of casting
 * @next to struct thread_info, which only works while thread_info happens
 * to be the first member of task_struct.
 */
static inline void switch_kernel_fpu_finish(struct task_struct *next)
{
	if (next->flags & PF_KTHREAD)
		return;

	if (cpu_feature_enabled(X86_FEATURE_FPU) &&
	    test_tsk_thread_flag(next, TIF_USING_FPU_NONATOMIC))
		fpregs_restore_kernelregs(&next->thread.fpu);
}
#else
/* Stub: nothing to save when neither Hygon LMC option is configured. */
static inline void switch_kernel_fpu_prepare(struct task_struct *prev, int cpu) { }
/* Stub: nothing to restore when neither Hygon LMC option is configured. */
static inline void switch_kernel_fpu_finish(struct task_struct *next) { }

#endif
Loading