Commit 023ef184 authored by Stuart Menefy's avatar Stuart Menefy Committed by Paul Mundt
Browse files

sh: __copy_user() optimizations for small copies.



This implements a fast-path for small (less than 12 bytes) copies,
with the existing path treated as the slow-path and left as the default
behaviour for all other copy sizes.

Signed-off-by: default avatarStuart Menefy <stuart.menefy@st.com>
Signed-off-by: default avatarPaul Mundt <lethal@linux-sh.org>
parent 24eb17e0
Loading
Loading
Loading
Loading
+108 −61
Original line number Diff line number Diff line
@@ -141,47 +141,38 @@ ENTRY(__copy_user_page)
	.long 9999b, 6000f	;	\
	.previous
ENTRY(__copy_user)
	tst	r6,r6		! Check explicitly for zero
	bf	1f
	rts
	 mov	#0,r0		! normal return
1:
	mov.l	r10,@-r15
	mov.l	r9,@-r15
	mov.l	r8,@-r15
	! Check if small number of bytes
	mov	#11,r0
	mov	r4,r3
	cmp/gt	r0,r6		! r6 (len) > r0 (11)
	bf/s	.L_cleanup_loop_no_pop
	 add	r6,r3		! last destination address
	mov	#12,r0		! Check if small number of bytes
	cmp/gt	r0,r6
	bt	2f
	bra	.L_cleanup_loop
	 nop
2:
	neg	r5,r0		! Calculate bytes needed to align source

	! Calculate bytes needed to align to src
	mov.l	r11,@-r15
	neg	r5,r0
	mov.l	r10,@-r15
	add	#4,r0
	mov.l	r9,@-r15
	and	#3,r0
	mov.l	r8,@-r15
	tst	r0,r0
	bt	.L_jump
	mov	r0,r1
	bt	2f

.L_loop1:
	! Copy bytes to align source
EX(	mov.b	@r5+,r0		)
	dt	r1
EX(	mov.b	r0,@r4		)
1:
	! Copy bytes to long word align src
EX(	mov.b	@r5+,r1		)
	dt	r0
	add	#-1,r6
	bf/s	.L_loop1
EX(	mov.b	r1,@r4		)
	bf/s	1b
	 add	#1,r4

.L_jump:
	mov	r6,r2		! Calculate number of longwords to copy
	! Jump to appropriate routine depending on dest
2:	mov	#3,r1
	mov	r6, r2
	and	r4,r1
	shlr2	r2
	tst	r2,r2
	bt	.L_cleanup

	mov	r4,r0		! Jump to appropriate routine
	and	#3,r0
	mov	r0,r1
	shll2	r1
	mova	.L_jump_tbl,r0
	mov.l	@(r0,r1),r1
@@ -195,43 +186,97 @@ EX( mov.b r0,@r4 )
	.long	.L_dest10
	.long	.L_dest11

/*
 * Come here if there are less than 12 bytes to copy
 *
 * Keep the branch target close, so the bf/s callee doesn't overflow
 * and result in a more expensive branch being inserted. This is the
 * fast-path for small copies, the jump via the jump table will hit the
 * default slow-path cleanup. -PFM.
 */
.L_cleanup_loop_no_pop:
	tst	r6,r6		! Check explicitly for zero
	bt	1f

2:
EX(	mov.b	@r5+,r0		)
	dt	r6
EX(	mov.b	r0,@r4		)
	bf/s	2b
	 add	#1,r4

1:	mov	#0,r0		! normal return
5000:

# Exception handler:
.section .fixup, "ax"
6000:
	mov.l	8000f,r1
	mov	r3,r0
	jmp	@r1
	 sub	r4,r0
	.align	2
8000:	.long	5000b

.previous
	rts
	 nop

! Destination = 00

.L_dest00:
	mov	r2,r7
	shlr2	r7
	shlr	r7
	tst	r7,r7
	mov	#7,r0
	bt/s	1f
	 and	r0,r2
	.align 2
	! Skip the large copy for small transfers
	mov	#(32+32-4), r0
	cmp/gt	r6, r0		! r0 (60) > r6 (len)
	bt	1f

	! Align dest to a 32 byte boundary
	neg	r4,r0
	add	#0x20, r0
	and	#0x1f, r0
	tst	r0, r0
	bt	2f

	sub	r0, r6
	shlr2	r0
3:
EX(	mov.l	@r5+,r1		)
	dt	r0
EX(	mov.l	r1,@r4		)
	bf/s	3b
	 add	#4,r4

2:
EX(	mov.l	@r5+,r0		)
EX(	mov.l	@r5+,r1		)
EX(	mov.l	@r5+,r2		)
EX(	mov.l	@r5+,r7		)
EX(	mov.l	@r5+,r8		)
EX(	mov.l	@r5+,r9		)
EX(	mov.l	@r5+,r10	)
EX(	mov.l	r0,@r4		)
EX(	mov.l	r8,@(4,r4)	)
EX(	mov.l	r9,@(8,r4)	)
EX(	mov.l	r10,@(12,r4)	)
EX(	mov.l	@r5+,r0		)
EX(	mov.l	@r5+,r8		)
EX(	mov.l	@r5+,r9		)
EX(	mov.l	@r5+,r10	)
	dt	r7
EX(	mov.l	r0,@(16,r4)	)
EX(	mov.l	r8,@(20,r4)	)
EX(	mov.l	r9,@(24,r4)	)
EX(	mov.l	r10,@(28,r4)	)
EX(	mov.l	@r5+,r11	)
EX(	movca.l	r0,@r4		)
	add	#-32, r6
EX(	mov.l	r1,@(4,r4)	)
	mov	#32, r0
EX(	mov.l	r2,@(8,r4)	)
	cmp/gt	r6, r0		! r0 (32) > r6 (len)
EX(	mov.l	r7,@(12,r4)	)
EX(	mov.l	r8,@(16,r4)	)
EX(	mov.l	r9,@(20,r4)	)
EX(	mov.l	r10,@(24,r4)	)
EX(	mov.l	r11,@(28,r4)	)
	bf/s	2b
	 add	#32,r4
	tst	r2,r2

1:	mov	r6, r0
	shlr2	r0
	tst	r0, r0
	bt	.L_cleanup
1:
EX(	mov.l	@r5+,r0		)
	dt	r2
EX(	mov.l	r0,@r4		)
EX(	mov.l	@r5+,r1		)
	dt	r0
EX(	mov.l	r1,@r4		)
	bf/s	1b
	 add	#4,r4

@@ -250,7 +295,7 @@ EX( mov.l r0,@r4 )
	 and	r0,r2
2:
	dt	r7
#ifdef __LITTLE_ENDIAN__
#ifdef CONFIG_CPU_LITTLE_ENDIAN
EX(	mov.l	@r5+,r0		)
EX(	mov.l	@r5+,r1		)
EX(	mov.l	@r5+,r8		)
@@ -320,7 +365,7 @@ EX( mov.w r0,@(2,r4) )
1:	! Read longword, write two words per iteration
EX(	mov.l	@r5+,r0		)
	dt	r2
#ifdef __LITTLE_ENDIAN__
#ifdef CONFIG_CPU_LITTLE_ENDIAN
EX(	mov.w	r0,@r4		)
	shlr16	r0
EX(	mov.w 	r0,@(2,r4)	)
@@ -342,7 +387,7 @@ EX( mov.w r0,@r4 )
	! Read longword, write byte, word, byte per iteration
EX(	mov.l	@r5+,r0		)
	dt	r2
#ifdef __LITTLE_ENDIAN__
#ifdef CONFIG_CPU_LITTLE_ENDIAN
EX(	mov.b	r0,@r4		)
	shlr8	r0
	add	#1,r4
@@ -379,6 +424,7 @@ EX( mov.b r0,@r4 )

.L_exit:
	mov	#0,r0		! normal return

5000:

# Exception handler:
@@ -394,5 +440,6 @@ EX( mov.b r0,@r4 )
.previous
	mov.l	@r15+,r8
	mov.l	@r15+,r9
	rts
	mov.l	@r15+,r10
	rts
	 mov.l	@r15+,r11