Commit 8941e93c authored by WANG Rui's avatar WANG Rui Committed by Huacai Chen
Browse files

LoongArch: Optimize memory ops (memset/memcpy/memmove)

To optimize memset()/memcpy()/memmove() and so on, we use a jump table
to dispatch cases for short data lengths; and for long data lengths, we
split the destination into head part (first 8 bytes), tail part (last 8
bytes) and middle part. The head part and tail part may be at unaligned
addresses, while the middle part is always aligned (the middle part is
allowed to overlap the head/tail part). In this way, the first and last
8 bytes may be unaligned accesses, but we can make sure the data in the
middle is processed at an aligned destination address.

We have tested micro-bench[1] on a Loongson-3C5000 16-core machine (2.2GHz):

1. memset

| length | src offset | dst offset | speed before | speed after | %       |
|--------|------------|------------|--------------|-------------|---------|
| 8      | 0          | 0          | 696.191      | 1518.785    | 118.16% |
| 8      | 0          | 1          | 696.325      | 1518.937    | 118.14% |
| 50     | 0          | 0          | 969.976      | 8053.902    | 730.32% |
| 50     | 0          | 1          | 970.034      | 8058.475    | 730.74% |
| 300    | 0          | 0          | 5876.612     | 16544.703   | 181.53% |
| 300    | 0          | 1          | 5030.849     | 16549.011   | 228.95% |
| 1200   | 0          | 0          | 11797.077    | 16752.137   | 42.00%  |
| 1200   | 0          | 1          | 5687.141     | 16645.233   | 192.68% |
| 4000   | 0          | 0          | 15723.27     | 16761.557   | 6.60%   |
| 4000   | 0          | 1          | 5906.114     | 16732.316   | 183.30% |
| 8000   | 0          | 0          | 16751.403    | 16770.002   | 0.11%   |
| 8000   | 0          | 1          | 5995.449     | 16754.07    | 179.45% |

2. memcpy

| length | src offset | dst offset | speed before | speed after | %       |
|--------|------------|------------|--------------|-------------|---------|
| 8      | 0          | 0          | 696.2        | 1670.605    | 139.96% |
| 8      | 0          | 1          | 696.325      | 1671.138    | 139.99% |
| 50     | 0          | 0          | 969.974      | 8724.999    | 799.51% |
| 50     | 0          | 1          | 970.032      | 8730.138    | 799.98% |
| 300    | 0          | 0          | 5564.662     | 16272.652   | 192.43% |
| 300    | 0          | 1          | 4670.436     | 14972.842   | 220.59% |
| 1200   | 0          | 0          | 10740.23     | 16751.728   | 55.97%  |
| 1200   | 0          | 1          | 5027.741     | 14874.564   | 195.85% |
| 4000   | 0          | 0          | 15122.367    | 16737.642   | 10.68%  |
| 4000   | 0          | 1          | 5536.918     | 14890.397   | 168.93% |
| 8000   | 0          | 0          | 16505.453    | 16553.543   | 0.29%   |
| 8000   | 0          | 1          | 5821.619     | 14841.804   | 154.94% |

3. memmove

| length | src offset | dst offset | speed before | speed after | %       |
|--------|------------|------------|--------------|-------------|---------|
| 8      | 0          | 0          | 982.693      | 1670.568    | 70.00%  |
| 8      | 0          | 1          | 983.023      | 1671.174    | 70.00%  |
| 50     | 0          | 0          | 1230.87      | 8727.625    | 609.06% |
| 50     | 0          | 1          | 1232.515     | 8730.138    | 608.32% |
| 300    | 0          | 0          | 6490.375     | 16296.993   | 151.09% |
| 300    | 0          | 1          | 4282.687     | 14972.842   | 249.61% |
| 1200   | 0          | 0          | 11742.755    | 16752.546   | 42.66%  |
| 1200   | 0          | 1          | 5039.338     | 14872.951   | 195.14% |
| 4000   | 0          | 0          | 15467.786    | 16737.09    | 8.21%   |
| 4000   | 0          | 1          | 5009.905     | 14890.542   | 197.22% |
| 8000   | 0          | 0          | 16489.664    | 16553.273   | 0.39%   |
| 8000   | 0          | 1          | 5823.786     | 14858.646   | 155.14% |

* speed: MB/s
* length: byte

[1] https://github.com/heiher/mem-bench



Signed-off-by: default avatarWANG Rui <wangrui@loongson.cn>
Signed-off-by: default avatarHuacai Chen <chenhuacai@loongson.cn>
parent 2b3bd32e
Loading
Loading
Loading
Loading
+121 −15
Original line number Diff line number Diff line
@@ -13,7 +13,14 @@

.irp to, 0, 1, 2, 3, 4, 5, 6, 7
.L_fixup_handle_\to\():
	addi.d	a0, a1, (\to) * (-8)
	sub.d	a0, a2, a0
	addi.d	a0, a0, (\to) * (-8)
	jr	ra
.endr

.irp to, 0, 2, 4
.L_fixup_handle_s\to\():
	addi.d	a0, a1, -\to
	jr	ra
.endr

@@ -44,7 +51,7 @@ SYM_FUNC_START(__clear_user_generic)
2:	move	a0, a1
	jr	ra

	_asm_extable 1b, .L_fixup_handle_0
	_asm_extable 1b, .L_fixup_handle_s0
SYM_FUNC_END(__clear_user_generic)

/*
@@ -54,12 +61,21 @@ SYM_FUNC_END(__clear_user_generic)
 * a1: size
 */
SYM_FUNC_START(__clear_user_fast)
	beqz	a1, 10f
	sltui	t0, a1, 9
	bnez	t0, .Lsmall

	ori	a2, zero, 64
	blt	a1, a2, 9f
	add.d	a2, a0, a1
0:	st.d	zero, a0, 0

	/* align up address */
	addi.d	a0, a0, 8
	bstrins.d	a0, zero, 2, 0

	addi.d	a3, a2, -64
	bgeu	a0, a3, .Llt64

	/* set 64 bytes at a time */
.Lloop64:
1:	st.d	zero, a0, 0
2:	st.d	zero, a0, 8
3:	st.d	zero, a0, 16
@@ -68,24 +84,95 @@ SYM_FUNC_START(__clear_user_fast)
6:	st.d	zero, a0, 40
7:	st.d	zero, a0, 48
8:	st.d	zero, a0, 56

	addi.d	a0, a0, 64
	addi.d	a1, a1, -64
	bge	a1, a2, 1b

	beqz	a1, 10f
	bltu	a0, a3, .Lloop64

	/* set the remaining bytes */
9:	st.b	zero, a0, 0
	addi.d	a0, a0, 1
	addi.d	a1, a1, -1
	bgt	a1, zero, 9b
.Llt64:
	addi.d	a3, a2, -32
	bgeu	a0, a3, .Llt32
9:	st.d	zero, a0, 0
10:	st.d	zero, a0, 8
11:	st.d	zero, a0, 16
12:	st.d	zero, a0, 24
	addi.d	a0, a0, 32

.Llt32:
	addi.d	a3, a2, -16
	bgeu	a0, a3, .Llt16
13:	st.d	zero, a0, 0
14:	st.d	zero, a0, 8
	addi.d	a0, a0, 16

.Llt16:
	addi.d	a3, a2, -8
	bgeu	a0, a3, .Llt8
15:	st.d	zero, a0, 0

.Llt8:
16:	st.d	zero, a2, -8

	/* return */
10:	move	a0, a1
	move	a0, zero
	jr	ra

	.align	4
.Lsmall:
	pcaddi	t0, 4
	slli.d	a2, a1, 4
	add.d	t0, t0, a2
	jr	t0

	.align	4
	move	a0, zero
	jr	ra

	.align	4
17:	st.b	zero, a0, 0
	move	a0, zero
	jr	ra

	.align	4
18:	st.h	zero, a0, 0
	move	a0, zero
	jr	ra

	.align	4
19:	st.h	zero, a0, 0
20:	st.b	zero, a0, 2
	move	a0, zero
	jr	ra

	.align	4
21:	st.w	zero, a0, 0
	move	a0, zero
	jr	ra

	.align	4
22:	st.w	zero, a0, 0
23:	st.b	zero, a0, 4
	move	a0, zero
	jr	ra

	.align	4
24:	st.w	zero, a0, 0
25:	st.h	zero, a0, 4
	move	a0, zero
	jr	ra

	.align	4
26:	st.w	zero, a0, 0
27:	st.w	zero, a0, 3
	move	a0, zero
	jr	ra

	.align	4
28:	st.d	zero, a0, 0
	move	a0, zero
	jr	ra

	/* fixup and ex_table */
	_asm_extable 0b, .L_fixup_handle_0
	_asm_extable 1b, .L_fixup_handle_0
	_asm_extable 2b, .L_fixup_handle_1
	_asm_extable 3b, .L_fixup_handle_2
@@ -95,4 +182,23 @@ SYM_FUNC_START(__clear_user_fast)
	_asm_extable 7b, .L_fixup_handle_6
	_asm_extable 8b, .L_fixup_handle_7
	_asm_extable 9b, .L_fixup_handle_0
	_asm_extable 10b, .L_fixup_handle_1
	_asm_extable 11b, .L_fixup_handle_2
	_asm_extable 12b, .L_fixup_handle_3
	_asm_extable 13b, .L_fixup_handle_0
	_asm_extable 14b, .L_fixup_handle_1
	_asm_extable 15b, .L_fixup_handle_0
	_asm_extable 16b, .L_fixup_handle_1
	_asm_extable 17b, .L_fixup_handle_s0
	_asm_extable 18b, .L_fixup_handle_s0
	_asm_extable 19b, .L_fixup_handle_s0
	_asm_extable 20b, .L_fixup_handle_s2
	_asm_extable 21b, .L_fixup_handle_s0
	_asm_extable 22b, .L_fixup_handle_s0
	_asm_extable 23b, .L_fixup_handle_s4
	_asm_extable 24b, .L_fixup_handle_s0
	_asm_extable 25b, .L_fixup_handle_s4
	_asm_extable 26b, .L_fixup_handle_s0
	_asm_extable 27b, .L_fixup_handle_s4
	_asm_extable 28b, .L_fixup_handle_s0
SYM_FUNC_END(__clear_user_fast)
+201 −50
Original line number Diff line number Diff line
@@ -13,7 +13,14 @@

.irp to, 0, 1, 2, 3, 4, 5, 6, 7
.L_fixup_handle_\to\():
	addi.d	a0, a2, (\to) * (-8)
	sub.d	a0, a2, a0
	addi.d	a0, a0, (\to) * (-8)
	jr	ra
.endr

.irp to, 0, 2, 4
.L_fixup_handle_s\to\():
	addi.d	a0, a2, -\to
	jr	ra
.endr

@@ -47,8 +54,8 @@ SYM_FUNC_START(__copy_user_generic)
3:	move	a0, a2
	jr	ra

	_asm_extable 1b, .L_fixup_handle_0
	_asm_extable 2b, .L_fixup_handle_0
	_asm_extable 1b, .L_fixup_handle_s0
	_asm_extable 2b, .L_fixup_handle_s0
SYM_FUNC_END(__copy_user_generic)

/*
@@ -59,65 +66,209 @@ SYM_FUNC_END(__copy_user_generic)
 * a2: n
 */
SYM_FUNC_START(__copy_user_fast)
	beqz	a2, 19f
	sltui	t0, a2, 9
	bnez	t0, .Lsmall

	ori	a3, zero, 64
	blt	a2, a3, 17f
	add.d	a3, a1, a2
	add.d	a2, a0, a2
0:	ld.d	t0, a1, 0
1:	st.d	t0, a0, 0

	/* copy 64 bytes at a time */
1:	ld.d	t0, a1, 0
2:	ld.d	t1, a1, 8
3:	ld.d	t2, a1, 16
4:	ld.d	t3, a1, 24
5:	ld.d	t4, a1, 32
6:	ld.d	t5, a1, 40
7:	ld.d	t6, a1, 48
8:	ld.d	t7, a1, 56
9:	st.d	t0, a0, 0
10:	st.d	t1, a0, 8
11:	st.d	t2, a0, 16
12:	st.d	t3, a0, 24
13:	st.d	t4, a0, 32
14:	st.d	t5, a0, 40
15:	st.d	t6, a0, 48
16:	st.d	t7, a0, 56
	/* align up destination address */
	andi	t1, a0, 7
	sub.d	t0, zero, t1
	addi.d	t0, t0, 8
	add.d	a1, a1, t0
	add.d	a0, a0, t0

	addi.d	a0, a0, 64
	addi.d	a1, a1, 64
	addi.d	a2, a2, -64
	bge	a2, a3, 1b
	addi.d	a4, a3, -64
	bgeu	a1, a4, .Llt64

	beqz	a2, 19f
	/* copy 64 bytes at a time */
.Lloop64:
2:	ld.d	t0, a1, 0
3:	ld.d	t1, a1, 8
4:	ld.d	t2, a1, 16
5:	ld.d	t3, a1, 24
6:	ld.d	t4, a1, 32
7:	ld.d	t5, a1, 40
8:	ld.d	t6, a1, 48
9:	ld.d	t7, a1, 56
	addi.d	a1, a1, 64
10:	st.d	t0, a0, 0
11:	st.d	t1, a0, 8
12:	st.d	t2, a0, 16
13:	st.d	t3, a0, 24
14:	st.d	t4, a0, 32
15:	st.d	t5, a0, 40
16:	st.d	t6, a0, 48
17:	st.d	t7, a0, 56
	addi.d	a0, a0, 64
	bltu	a1, a4, .Lloop64

	/* copy the remaining bytes */
17:	ld.b	t0, a1, 0
18:	st.b	t0, a0, 0
	addi.d	a0, a0, 1
	addi.d	a1, a1, 1
	addi.d	a2, a2, -1
	bgt	a2, zero, 17b
.Llt64:
	addi.d	a4, a3, -32
	bgeu	a1, a4, .Llt32
18:	ld.d	t0, a1, 0
19:	ld.d	t1, a1, 8
20:	ld.d	t2, a1, 16
21:	ld.d	t3, a1, 24
	addi.d	a1, a1, 32
22:	st.d	t0, a0, 0
23:	st.d	t1, a0, 8
24:	st.d	t2, a0, 16
25:	st.d	t3, a0, 24
	addi.d	a0, a0, 32

.Llt32:
	addi.d	a4, a3, -16
	bgeu	a1, a4, .Llt16
26:	ld.d	t0, a1, 0
27:	ld.d	t1, a1, 8
	addi.d	a1, a1, 16
28:	st.d	t0, a0, 0
29:	st.d	t1, a0, 8
	addi.d	a0, a0, 16

.Llt16:
	addi.d	a4, a3, -8
	bgeu	a1, a4, .Llt8
30:	ld.d	t0, a1, 0
31:	st.d	t0, a0, 0

.Llt8:
32:	ld.d	t0, a3, -8
33:	st.d	t0, a2, -8

	/* return */
19:	move	a0, a2
	move	a0, zero
	jr	ra

	.align	5
.Lsmall:
	pcaddi	t0, 8
	slli.d	a3, a2, 5
	add.d	t0, t0, a3
	jr	t0

	.align	5
	move	a0, zero
	jr	ra

	.align	5
34:	ld.b	t0, a1, 0
35:	st.b	t0, a0, 0
	move	a0, zero
	jr	ra

	.align	5
36:	ld.h	t0, a1, 0
37:	st.h	t0, a0, 0
	move	a0, zero
	jr	ra

	.align	5
38:	ld.h	t0, a1, 0
39:	ld.b	t1, a1, 2
40:	st.h	t0, a0, 0
41:	st.b	t1, a0, 2
	move	a0, zero
	jr	ra

	.align	5
42:	ld.w	t0, a1, 0
43:	st.w	t0, a0, 0
	move	a0, zero
	jr	ra

	.align	5
44:	ld.w	t0, a1, 0
45:	ld.b	t1, a1, 4
46:	st.w	t0, a0, 0
47:	st.b	t1, a0, 4
	move	a0, zero
	jr	ra

	.align	5
48:	ld.w	t0, a1, 0
49:	ld.h	t1, a1, 4
50:	st.w	t0, a0, 0
51:	st.h	t1, a0, 4
	move	a0, zero
	jr	ra

	.align	5
52:	ld.w	t0, a1, 0
53:	ld.w	t1, a1, 3
54:	st.w	t0, a0, 0
55:	st.w	t1, a0, 3
	move	a0, zero
	jr	ra

	.align	5
56:	ld.d	t0, a1, 0
57:	st.d	t0, a0, 0
	move	a0, zero
	jr	ra

	/* fixup and ex_table */
	_asm_extable 0b, .L_fixup_handle_0
	_asm_extable 1b, .L_fixup_handle_0
	_asm_extable 2b, .L_fixup_handle_1
	_asm_extable 3b, .L_fixup_handle_2
	_asm_extable 4b, .L_fixup_handle_3
	_asm_extable 5b, .L_fixup_handle_4
	_asm_extable 6b, .L_fixup_handle_5
	_asm_extable 7b, .L_fixup_handle_6
	_asm_extable 8b, .L_fixup_handle_7
	_asm_extable 2b, .L_fixup_handle_0
	_asm_extable 3b, .L_fixup_handle_0
	_asm_extable 4b, .L_fixup_handle_0
	_asm_extable 5b, .L_fixup_handle_0
	_asm_extable 6b, .L_fixup_handle_0
	_asm_extable 7b, .L_fixup_handle_0
	_asm_extable 8b, .L_fixup_handle_0
	_asm_extable 9b, .L_fixup_handle_0
	_asm_extable 10b, .L_fixup_handle_1
	_asm_extable 11b, .L_fixup_handle_2
	_asm_extable 12b, .L_fixup_handle_3
	_asm_extable 13b, .L_fixup_handle_4
	_asm_extable 14b, .L_fixup_handle_5
	_asm_extable 15b, .L_fixup_handle_6
	_asm_extable 16b, .L_fixup_handle_7
	_asm_extable 17b, .L_fixup_handle_0
	_asm_extable 10b, .L_fixup_handle_0
	_asm_extable 11b, .L_fixup_handle_1
	_asm_extable 12b, .L_fixup_handle_2
	_asm_extable 13b, .L_fixup_handle_3
	_asm_extable 14b, .L_fixup_handle_4
	_asm_extable 15b, .L_fixup_handle_5
	_asm_extable 16b, .L_fixup_handle_6
	_asm_extable 17b, .L_fixup_handle_7
	_asm_extable 18b, .L_fixup_handle_0
	_asm_extable 19b, .L_fixup_handle_0
	_asm_extable 20b, .L_fixup_handle_0
	_asm_extable 21b, .L_fixup_handle_0
	_asm_extable 22b, .L_fixup_handle_0
	_asm_extable 23b, .L_fixup_handle_1
	_asm_extable 24b, .L_fixup_handle_2
	_asm_extable 25b, .L_fixup_handle_3
	_asm_extable 26b, .L_fixup_handle_0
	_asm_extable 27b, .L_fixup_handle_0
	_asm_extable 28b, .L_fixup_handle_0
	_asm_extable 29b, .L_fixup_handle_1
	_asm_extable 30b, .L_fixup_handle_0
	_asm_extable 31b, .L_fixup_handle_0
	_asm_extable 32b, .L_fixup_handle_0
	_asm_extable 33b, .L_fixup_handle_1
	_asm_extable 34b, .L_fixup_handle_s0
	_asm_extable 35b, .L_fixup_handle_s0
	_asm_extable 36b, .L_fixup_handle_s0
	_asm_extable 37b, .L_fixup_handle_s0
	_asm_extable 38b, .L_fixup_handle_s0
	_asm_extable 39b, .L_fixup_handle_s0
	_asm_extable 40b, .L_fixup_handle_s0
	_asm_extable 41b, .L_fixup_handle_s2
	_asm_extable 42b, .L_fixup_handle_s0
	_asm_extable 43b, .L_fixup_handle_s0
	_asm_extable 44b, .L_fixup_handle_s0
	_asm_extable 45b, .L_fixup_handle_s0
	_asm_extable 46b, .L_fixup_handle_s0
	_asm_extable 47b, .L_fixup_handle_s4
	_asm_extable 48b, .L_fixup_handle_s0
	_asm_extable 49b, .L_fixup_handle_s0
	_asm_extable 50b, .L_fixup_handle_s0
	_asm_extable 51b, .L_fixup_handle_s4
	_asm_extable 52b, .L_fixup_handle_s0
	_asm_extable 53b, .L_fixup_handle_s0
	_asm_extable 54b, .L_fixup_handle_s0
	_asm_extable 55b, .L_fixup_handle_s4
	_asm_extable 56b, .L_fixup_handle_s0
	_asm_extable 57b, .L_fixup_handle_s0
SYM_FUNC_END(__copy_user_fast)
+121 −26
Original line number Diff line number Diff line
@@ -44,6 +44,66 @@ SYM_FUNC_START(__memcpy_generic)
SYM_FUNC_END(__memcpy_generic)
_ASM_NOKPROBE(__memcpy_generic)

	.align	5
SYM_FUNC_START_NOALIGN(__memcpy_small)
	pcaddi	t0, 8
	slli.d	a2, a2, 5
	add.d	t0, t0, a2
	jr	t0

	.align	5
0:	jr	ra

	.align	5
1:	ld.b	t0, a1, 0
	st.b	t0, a0, 0
	jr	ra

	.align	5
2:	ld.h	t0, a1, 0
	st.h	t0, a0, 0
	jr	ra

	.align	5
3:	ld.h	t0, a1, 0
	ld.b	t1, a1, 2
	st.h	t0, a0, 0
	st.b	t1, a0, 2
	jr	ra

	.align	5
4:	ld.w	t0, a1, 0
	st.w	t0, a0, 0
	jr	ra

	.align	5
5:	ld.w	t0, a1, 0
	ld.b	t1, a1, 4
	st.w	t0, a0, 0
	st.b	t1, a0, 4
	jr	ra

	.align	5
6:	ld.w	t0, a1, 0
	ld.h	t1, a1, 4
	st.w	t0, a0, 0
	st.h	t1, a0, 4
	jr	ra

	.align	5
7:	ld.w	t0, a1, 0
	ld.w	t1, a1, 3
	st.w	t0, a0, 0
	st.w	t1, a0, 3
	jr	ra

	.align	5
8:	ld.d	t0, a1, 0
	st.d	t0, a0, 0
	jr	ra
SYM_FUNC_END(__memcpy_small)
_ASM_NOKPROBE(__memcpy_small)

/*
 * void *__memcpy_fast(void *dst, const void *src, size_t n)
 *
@@ -52,14 +112,27 @@ _ASM_NOKPROBE(__memcpy_generic)
 * a2: n
 */
SYM_FUNC_START(__memcpy_fast)
	move	a3, a0
	beqz	a2, 3f
	sltui	t0, a2, 9
	bnez	t0, __memcpy_small

	add.d	a3, a1, a2
	add.d	a2, a0, a2
	ld.d	a6, a1, 0
	ld.d	a7, a3, -8

	/* align up destination address */
	andi	t1, a0, 7
	sub.d	t0, zero, t1
	addi.d	t0, t0, 8
	add.d	a1, a1, t0
	add.d	a5, a0, t0

	ori	a4, zero, 64
	blt	a2, a4, 2f
	addi.d	a4, a3, -64
	bgeu	a1, a4, .Llt64

	/* copy 64 bytes at a time */
1:	ld.d	t0, a1, 0
.Lloop64:
	ld.d	t0, a1, 0
	ld.d	t1, a1, 8
	ld.d	t2, a1, 16
	ld.d	t3, a1, 24
@@ -67,32 +140,54 @@ SYM_FUNC_START(__memcpy_fast)
	ld.d	t5, a1, 40
	ld.d	t6, a1, 48
	ld.d	t7, a1, 56
	st.d	t0, a0, 0
	st.d	t1, a0, 8
	st.d	t2, a0, 16
	st.d	t3, a0, 24
	st.d	t4, a0, 32
	st.d	t5, a0, 40
	st.d	t6, a0, 48
	st.d	t7, a0, 56

	addi.d	a0, a0, 64
	addi.d	a1, a1, 64
	addi.d	a2, a2, -64
	bge	a2, a4, 1b

	beqz	a2, 3f
	st.d	t0, a5, 0
	st.d	t1, a5, 8
	st.d	t2, a5, 16
	st.d	t3, a5, 24
	st.d	t4, a5, 32
	st.d	t5, a5, 40
	st.d	t6, a5, 48
	st.d	t7, a5, 56
	addi.d	a5, a5, 64
	bltu	a1, a4, .Lloop64

	/* copy the remaining bytes */
2:	ld.b	t0, a1, 0
	st.b	t0, a0, 0
	addi.d	a0, a0, 1
	addi.d	a1, a1, 1
	addi.d	a2, a2, -1
	bgt	a2, zero, 2b
.Llt64:
	addi.d	a4, a3, -32
	bgeu	a1, a4, .Llt32
	ld.d	t0, a1, 0
	ld.d	t1, a1, 8
	ld.d	t2, a1, 16
	ld.d	t3, a1, 24
	addi.d	a1, a1, 32
	st.d	t0, a5, 0
	st.d	t1, a5, 8
	st.d	t2, a5, 16
	st.d	t3, a5, 24
	addi.d	a5, a5, 32

.Llt32:
	addi.d	a4, a3, -16
	bgeu	a1, a4, .Llt16
	ld.d	t0, a1, 0
	ld.d	t1, a1, 8
	addi.d	a1, a1, 16
	st.d	t0, a5, 0
	st.d	t1, a5, 8
	addi.d	a5, a5, 16

.Llt16:
	addi.d	a4, a3, -8
	bgeu	a1, a4, .Llt8
	ld.d	t0, a1, 0
	st.d	t0, a5, 0

.Llt8:
	st.d	a6, a0, 0
	st.d	a7, a2, -8

	/* return */
3:	move	a0, a3
	jr	ra
SYM_FUNC_END(__memcpy_fast)
_ASM_NOKPROBE(__memcpy_fast)
+68 −52
Original line number Diff line number Diff line
@@ -11,23 +11,9 @@
#include <asm/regdef.h>

SYM_FUNC_START(memmove)
	blt	a0, a1, 1f	/* dst < src, memcpy */
	blt	a1, a0, 3f	/* src < dst, rmemcpy */
	blt	a0, a1, memcpy	/* dst < src, memcpy */
	blt	a1, a0, rmemcpy	/* src < dst, rmemcpy */
	jr	ra		/* dst == src, return */

	/* if (src - dst) < 64, copy 1 byte at a time */
1:	ori	a3, zero, 64
	sub.d	t0, a1, a0
	blt	t0, a3, 2f
	b	memcpy
2:	b	__memcpy_generic

	/* if (dst - src) < 64, copy 1 byte at a time */
3:	ori	a3, zero, 64
	sub.d	t0, a0, a1
	blt	t0, a3, 4f
	b	rmemcpy
4:	b	__rmemcpy_generic
SYM_FUNC_END(memmove)
_ASM_NOKPROBE(memmove)

@@ -76,50 +62,80 @@ _ASM_NOKPROBE(__rmemcpy_generic)
 * a2: n
 */
SYM_FUNC_START(__rmemcpy_fast)
	move	a3, a0
	beqz	a2, 3f
	sltui	t0, a2, 9
	bnez	t0, __memcpy_small

	add.d	a0, a0, a2
	add.d	a1, a1, a2
	add.d	a3, a1, a2
	add.d	a2, a0, a2
	ld.d	a6, a1, 0
	ld.d	a7, a3, -8

	/* align up destination address */
	andi	t1, a2, 7
	sub.d	a3, a3, t1
	sub.d	a5, a2, t1

	ori	a4, zero, 64
	blt	a2, a4, 2f
	addi.d	a4, a1, 64
	bgeu	a4, a3, .Llt64

	/* copy 64 bytes at a time */
1:	ld.d	t0, a1, -8
	ld.d	t1, a1, -16
	ld.d	t2, a1, -24
	ld.d	t3, a1, -32
	ld.d	t4, a1, -40
	ld.d	t5, a1, -48
	ld.d	t6, a1, -56
	ld.d	t7, a1, -64
	st.d	t0, a0, -8
	st.d	t1, a0, -16
	st.d	t2, a0, -24
	st.d	t3, a0, -32
	st.d	t4, a0, -40
	st.d	t5, a0, -48
	st.d	t6, a0, -56
	st.d	t7, a0, -64

	addi.d	a0, a0, -64
	addi.d	a1, a1, -64
	addi.d	a2, a2, -64
	bge	a2, a4, 1b

	beqz	a2, 3f
.Lloop64:
	ld.d	t0, a3, -8
	ld.d	t1, a3, -16
	ld.d	t2, a3, -24
	ld.d	t3, a3, -32
	ld.d	t4, a3, -40
	ld.d	t5, a3, -48
	ld.d	t6, a3, -56
	ld.d	t7, a3, -64
	addi.d	a3, a3, -64
	st.d	t0, a5, -8
	st.d	t1, a5, -16
	st.d	t2, a5, -24
	st.d	t3, a5, -32
	st.d	t4, a5, -40
	st.d	t5, a5, -48
	st.d	t6, a5, -56
	st.d	t7, a5, -64
	addi.d	a5, a5, -64
	bltu	a4, a3, .Lloop64

	/* copy the remaining bytes */
2:	ld.b	t0, a1, -1
	st.b	t0, a0, -1
	addi.d	a0, a0, -1
	addi.d	a1, a1, -1
	addi.d	a2, a2, -1
	bgt	a2, zero, 2b
.Llt64:
	addi.d	a4, a1, 32
	bgeu	a4, a3, .Llt32
	ld.d	t0, a3, -8
	ld.d	t1, a3, -16
	ld.d	t2, a3, -24
	ld.d	t3, a3, -32
	addi.d	a3, a3, -32
	st.d	t0, a5, -8
	st.d	t1, a5, -16
	st.d	t2, a5, -24
	st.d	t3, a5, -32
	addi.d	a5, a5, -32

.Llt32:
	addi.d	a4, a1, 16
	bgeu	a4, a3, .Llt16
	ld.d	t0, a3, -8
	ld.d	t1, a3, -16
	addi.d	a3, a3, -16
	st.d	t0, a5, -8
	st.d	t1, a5, -16
	addi.d	a5, a5, -16

.Llt16:
	addi.d	a4, a1, 8
	bgeu	a4, a3, .Llt8
	ld.d	t0, a3, -8
	st.d	t0, a5, -8

.Llt8:
	st.d	a6, a0, 0
	st.d	a7, a2, -8

	/* return */
3:	move	a0, a3
	jr	ra
SYM_FUNC_END(__rmemcpy_fast)
_ASM_NOKPROBE(__rmemcpy_fast)
+92 −24
Original line number Diff line number Diff line
@@ -56,39 +56,107 @@ _ASM_NOKPROBE(__memset_generic)
 * a2: n
 */
SYM_FUNC_START(__memset_fast)
	move	a3, a0
	beqz	a2, 3f

	ori	a4, zero, 64
	blt	a2, a4, 2f

	/* fill a1 to 64 bits */
	fill_to_64 a1

	/* set 64 bytes at a time */
1:	st.d	a1, a0, 0
	st.d	a1, a0, 8
	st.d	a1, a0, 16
	st.d	a1, a0, 24
	st.d	a1, a0, 32
	st.d	a1, a0, 40
	st.d	a1, a0, 48
	st.d	a1, a0, 56
	sltui	t0, a2, 9
	bnez	t0, .Lsmall

	addi.d	a0, a0, 64
	addi.d	a2, a2, -64
	bge	a2, a4, 1b
	add.d	a2, a0, a2
	st.d	a1, a0, 0

	beqz	a2, 3f
	/* align up address */
	addi.d	a3, a0, 8
	bstrins.d	a3, zero, 2, 0

	addi.d	a4, a2, -64
	bgeu	a3, a4, .Llt64

	/* set 64 bytes at a time */
.Lloop64:
	st.d	a1, a3, 0
	st.d	a1, a3, 8
	st.d	a1, a3, 16
	st.d	a1, a3, 24
	st.d	a1, a3, 32
	st.d	a1, a3, 40
	st.d	a1, a3, 48
	st.d	a1, a3, 56
	addi.d	a3, a3, 64
	bltu	a3, a4, .Lloop64

	/* set the remaining bytes */
2:	st.b	a1, a0, 0
	addi.d	a0, a0, 1
	addi.d	a2, a2, -1
	bgt	a2, zero, 2b
.Llt64:
	addi.d	a4, a2, -32
	bgeu	a3, a4, .Llt32
	st.d	a1, a3, 0
	st.d	a1, a3, 8
	st.d	a1, a3, 16
	st.d	a1, a3, 24
	addi.d	a3, a3, 32

.Llt32:
	addi.d	a4, a2, -16
	bgeu	a3, a4, .Llt16
	st.d	a1, a3, 0
	st.d	a1, a3, 8
	addi.d	a3, a3, 16

.Llt16:
	addi.d	a4, a2, -8
	bgeu	a3, a4, .Llt8
	st.d	a1, a3, 0

.Llt8:
	st.d	a1, a2, -8

	/* return */
3:	move	a0, a3
	jr	ra

	.align	4
.Lsmall:
	pcaddi	t0, 4
	slli.d	a2, a2, 4
	add.d	t0, t0, a2
	jr	t0

	.align	4
0:	jr	ra

	.align	4
1:	st.b	a1, a0, 0
	jr	ra

	.align	4
2:	st.h	a1, a0, 0
	jr	ra

	.align	4
3:	st.h	a1, a0, 0
	st.b	a1, a0, 2
	jr	ra

	.align	4
4:	st.w	a1, a0, 0
	jr	ra

	.align	4
5:	st.w	a1, a0, 0
	st.b	a1, a0, 4
	jr	ra

	.align	4
6:	st.w	a1, a0, 0
	st.h	a1, a0, 4
	jr	ra

	.align	4
7:	st.w	a1, a0, 0
	st.w	a1, a0, 3
	jr	ra

	.align	4
8:	st.d	a1, a0, 0
	jr	ra
SYM_FUNC_END(__memset_fast)
_ASM_NOKPROBE(__memset_fast)