Commit 720dc7ab authored by Huacai Chen's avatar Huacai Chen
Browse files

LoongArch: Add subword xchg/cmpxchg emulation

LoongArch only support 32-bit/64-bit xchg/cmpxchg in native. But percpu
operation, qspinlock and some drivers need 8-bit/16-bit xchg/cmpxchg. We
add subword xchg/cmpxchg emulation in this patch because the emulation
has better performance than the generic implementation (on NUMA system),
and it can fix some build errors meanwhile [1].

LoongArch's guarantee for forward progress (avoid many ll/sc happening
at the same time and no one succeeds):

We have the "exclusive access (with timeout) of ll" feature to avoid
simultaneous ll (which also blocks other memory load/store on the same
address), and the "random delay of sc" feature to avoid simultaneous
sc. It is a mandatory requirement for multi-core LoongArch processors
to implement such features, only except those single-core and dual-core
processors (they also don't support multi-chip interconnection).

Feature bits are introduced in CPUCFG3, bit 3 and bit 4 [2].

[1] https://lore.kernel.org/loongarch/CAAhV-H6vvkuOzy8OemWdYK3taj5Jn3bFX0ZTwE=twM8ywpBUYA@mail.gmail.com/T/#t
[2] https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html#_cpucfg



Reported-by: default avatarSudip Mukherjee (Codethink) <sudipm.mukherjee@gmail.com>
Suggested-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: default avatarRui Wang <wangrui@loongson.cn>
Signed-off-by: default avatarHuacai Chen <chenhuacai@loongson.cn>
parent 092e9ebe
Loading
Loading
Loading
Loading
+97 −1
Original line number Diff line number Diff line
@@ -5,8 +5,9 @@
#ifndef __ASM_CMPXCHG_H
#define __ASM_CMPXCHG_H

#include <asm/barrier.h>
#include <linux/bits.h>
#include <linux/build_bug.h>
#include <asm/barrier.h>

#define __xchg_asm(amswap_db, m, val)		\
({						\
@@ -21,10 +22,53 @@
		__ret;				\
})

static inline unsigned int __xchg_small(volatile void *ptr, unsigned int val,
					unsigned int size)
{
	unsigned int shift;
	u32 old32, mask, temp;
	volatile u32 *ptr32;

	/* Mask value to the correct size. */
	mask = GENMASK((size * BITS_PER_BYTE) - 1, 0);
	val &= mask;

	/*
	 * Calculate a shift & mask that correspond to the value we wish to
	 * exchange within the naturally aligned 4 byte integerthat includes
	 * it.
	 */
	shift = (unsigned long)ptr & 0x3;
	shift *= BITS_PER_BYTE;
	mask <<= shift;

	/*
	 * Calculate a pointer to the naturally aligned 4 byte integer that
	 * includes our byte of interest, and load its value.
	 */
	ptr32 = (volatile u32 *)((unsigned long)ptr & ~0x3);

	asm volatile (
	"1:	ll.w		%0, %3		\n"
	"	andn		%1, %0, %z4	\n"
	"	or		%1, %1, %z5	\n"
	"	sc.w		%1, %2		\n"
	"	beqz		%1, 1b		\n"
	: "=&r" (old32), "=&r" (temp), "=ZC" (*ptr32)
	: "ZC" (*ptr32), "Jr" (mask), "Jr" (val << shift)
	: "memory");

	return (old32 & mask) >> shift;
}

static inline unsigned long __xchg(volatile void *ptr, unsigned long x,
				   int size)
{
	switch (size) {
	case 1:
	case 2:
		return __xchg_small(ptr, x, size);

	case 4:
		return __xchg_asm("amswap_db.w", (volatile u32 *)ptr, (u32)x);

@@ -67,10 +111,62 @@ static inline unsigned long __xchg(volatile void *ptr, unsigned long x,
	__ret;								\
})

static inline unsigned int __cmpxchg_small(volatile void *ptr, unsigned int old,
					   unsigned int new, unsigned int size)
{
	unsigned int shift;
	u32 old32, mask, temp;
	volatile u32 *ptr32;

	/* Mask inputs to the correct size. */
	mask = GENMASK((size * BITS_PER_BYTE) - 1, 0);
	old &= mask;
	new &= mask;

	/*
	 * Calculate a shift & mask that correspond to the value we wish to
	 * compare & exchange within the naturally aligned 4 byte integer
	 * that includes it.
	 */
	shift = (unsigned long)ptr & 0x3;
	shift *= BITS_PER_BYTE;
	old <<= shift;
	new <<= shift;
	mask <<= shift;

	/*
	 * Calculate a pointer to the naturally aligned 4 byte integer that
	 * includes our byte of interest, and load its value.
	 */
	ptr32 = (volatile u32 *)((unsigned long)ptr & ~0x3);

	asm volatile (
	"1:	ll.w		%0, %3		\n"
	"	and		%1, %0, %z4	\n"
	"	bne		%1, %z5, 2f	\n"
	"	andn		%1, %0, %z4	\n"
	"	or		%1, %1, %z6	\n"
	"	sc.w		%1, %2		\n"
	"	beqz		%1, 1b		\n"
	"	b		3f		\n"
	"2:					\n"
	__WEAK_LLSC_MB
	"3:					\n"
	: "=&r" (old32), "=&r" (temp), "=ZC" (*ptr32)
	: "ZC" (*ptr32), "Jr" (mask), "Jr" (old), "Jr" (new)
	: "memory");

	return (old32 & mask) >> shift;
}

static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
				      unsigned long new, unsigned int size)
{
	switch (size) {
	case 1:
	case 2:
		return __cmpxchg_small(ptr, old, new, size);

	case 4:
		return __cmpxchg_asm("ll.w", "sc.w", (volatile u32 *)ptr,
				     (u32)old, new);
+8 −0
Original line number Diff line number Diff line
@@ -123,6 +123,10 @@ static inline unsigned long __percpu_xchg(void *ptr, unsigned long val,
						int size)
{
	switch (size) {
	case 1:
	case 2:
		return __xchg_small((volatile void *)ptr, val, size);

	case 4:
		return __xchg_asm("amswap.w", (volatile u32 *)ptr, (u32)val);

@@ -204,9 +208,13 @@ do { \
#define this_cpu_write_4(pcp, val) _percpu_write(pcp, val)
#define this_cpu_write_8(pcp, val) _percpu_write(pcp, val)

#define this_cpu_xchg_1(pcp, val) _percpu_xchg(pcp, val)
#define this_cpu_xchg_2(pcp, val) _percpu_xchg(pcp, val)
#define this_cpu_xchg_4(pcp, val) _percpu_xchg(pcp, val)
#define this_cpu_xchg_8(pcp, val) _percpu_xchg(pcp, val)

#define this_cpu_cmpxchg_1(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
#define this_cpu_cmpxchg_2(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
#define this_cpu_cmpxchg_4(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
#define this_cpu_cmpxchg_8(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)