commit 47ee3f1dd9 ("x86: re-introduce support for ERMS copies for user space accesses")
I tried to streamline our user memory copy code fairly aggressively in
commit adfcf4231b ("x86: don't use REP_GOOD or ERMS for user memory
copies"), in order to then be able to clean up the code and inline the
modern FSRM case in commit 577e6a7fd5 ("x86: inline the 'rep movs' in
user copies for the FSRM case").

We had reports [1] of that causing regressions earlier with blogbench,
but that turned out to be a horrible benchmark for that case, and not a
sufficient reason for re-instating "rep movsb" on older machines.

However, now Eric Dumazet reported [2] a performance regression in what
seems to be a rather more real benchmark: due to the removal of
"rep movs", a TCP stream over a 100Gbps network no longer reaches line
speed.

And it turns out that with the simplified calling convention for the
non-FSRM case in commit 427fda2c8a ("x86: improve on the non-rep
'copy_user' function"), re-introducing the ERMS case is actually fairly
simple.

Of course, that "fairly simple" is glossing over several missteps due to
having to fight our assembler alternative code. This code really wanted
to rewrite a conditional branch to have two different targets, but that
made objtool sufficiently unhappy that this instead just ended up doing
a choice between "jump to the unrolled loop, or use 'rep movsb'
directly".

Let's see if somebody finds a case where the kernel memory copies also
care (see commit 68674f94ff: "x86: don't use REP_GOOD or ERMS for small
memory copies"). But Eric does argue that the user copies are special
because networking tries to copy up to 32KB at a time, if order-3 page
allocations are possible.

In-kernel memory copies are typically small, unless they are the
special "copy pages at a time" kind that still use "rep movs".

Link: https://lore.kernel.org/lkml/202305041446.71d46724-yujie.liu@intel.com/ [1]
Link: https://lore.kernel.org/lkml/CANn89iKUbyrJ=r2+_kK+sb2ZSSHifFZ7QkPLDpAtkJ8v4WUumA@mail.gmail.com/ [2]
Reported-and-tested-by: Eric Dumazet <edumazet@google.com>
Fixes: adfcf4231b ("x86: don't use REP_GOOD or ERMS for user memory copies")
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com>
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * Functions to copy from and to user space.
 */

#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
#include <asm/asm.h>
#include <asm/export.h>

/*
 * rep_movs_alternative - memory copy with exception handling.
 * This version is for CPUs that don't have FSRM (Fast Short Rep Movs)
 *
 * Input:
 * rdi destination
 * rsi source
 * rcx count
 *
 * Output:
 * rcx uncopied bytes or 0 if successful.
 *
 * NOTE! The calling convention is very intentionally the same as
 * for 'rep movs', so that we can rewrite the function call with
 * just a plain 'rep movs' on machines that have FSRM. But to make
 * it simpler for us, we can clobber rsi/rdi and rax/r8-r11 freely.
 */
SYM_FUNC_START(rep_movs_alternative)
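	/*
	 * Dispatch on size: 64 bytes or more goes to .Llarge ('rep movsb'
	 * or the unrolled loop), 8..63 bytes to the word loop, 1..7 bytes
	 * falls through to the byte tail, and zero goes straight to .Lexit.
	 */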
	cmpq $64,%rcx
	jae .Llarge

	cmp $8,%ecx
	jae .Lword

	testl %ecx,%ecx
	je .Lexit

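/*
 * Byte-at-a-time tail copy.  Handles the final 1..7 bytes, and doubles
 * as the fault-recovery path for the word and unrolled loops: the
 * remaining bytes are retried one at a time until the faulting byte is
 * hit again, leaving the exact uncopied count in %rcx.
 */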
.Lcopy_user_tail:
0:	movb (%rsi),%al
1:	movb %al,(%rdi)
	inc %rdi
	inc %rsi
	dec %rcx
	jne .Lcopy_user_tail
.Lexit:
	RET

	_ASM_EXTABLE_UA( 0b, .Lexit)
	_ASM_EXTABLE_UA( 1b, .Lexit)

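/*
 * Word-at-a-time copy for 8..63 bytes, also used for whatever is left
 * over after the unrolled loop.  A fault falls back to the byte tail so
 * the uncopied count stays exact.
 */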
	.p2align 4
.Lword:
2:	movq (%rsi),%rax
3:	movq %rax,(%rdi)
	addq $8,%rsi
	addq $8,%rdi
	sub $8,%ecx
	je .Lexit
	cmp $8,%ecx
	jae .Lword
	jmp .Lcopy_user_tail

	_ASM_EXTABLE_UA( 2b, .Lcopy_user_tail)
	_ASM_EXTABLE_UA( 3b, .Lcopy_user_tail)

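/*
 * Copies of 64 bytes or more.  On CPUs with ERMS the alternative below
 * is patched into a bare 'rep movsb'; if that faults mid-copy, the
 * extable entry returns with the remaining count already in %rcx.
 * Everything else jumps to the unrolled loop instead.
 */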
.Llarge:
0:	ALTERNATIVE "jmp .Lunrolled", "rep movsb", X86_FEATURE_ERMS
1:	RET

	_ASM_EXTABLE_UA( 0b, 1b)

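/*
 * Non-ERMS large-copy path: 64 bytes per iteration through %r8-%r11,
 * with the word and byte paths above mopping up the remainder.
 */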
	.p2align 4
.Lunrolled:
10:	movq (%rsi),%r8
11:	movq 8(%rsi),%r9
12:	movq 16(%rsi),%r10
13:	movq 24(%rsi),%r11
14:	movq %r8,(%rdi)
15:	movq %r9,8(%rdi)
16:	movq %r10,16(%rdi)
17:	movq %r11,24(%rdi)
20:	movq 32(%rsi),%r8
21:	movq 40(%rsi),%r9
22:	movq 48(%rsi),%r10
23:	movq 56(%rsi),%r11
24:	movq %r8,32(%rdi)
25:	movq %r9,40(%rdi)
26:	movq %r10,48(%rdi)
27:	movq %r11,56(%rdi)
	addq $64,%rsi
	addq $64,%rdi
	subq $64,%rcx
	cmpq $64,%rcx
	jae .Lunrolled
	cmpl $8,%ecx
	jae .Lword
	testl %ecx,%ecx
	jne .Lcopy_user_tail
	RET

	_ASM_EXTABLE_UA(10b, .Lcopy_user_tail)
	_ASM_EXTABLE_UA(11b, .Lcopy_user_tail)
	_ASM_EXTABLE_UA(12b, .Lcopy_user_tail)
	_ASM_EXTABLE_UA(13b, .Lcopy_user_tail)
	_ASM_EXTABLE_UA(14b, .Lcopy_user_tail)
	_ASM_EXTABLE_UA(15b, .Lcopy_user_tail)
	_ASM_EXTABLE_UA(16b, .Lcopy_user_tail)
	_ASM_EXTABLE_UA(17b, .Lcopy_user_tail)
	_ASM_EXTABLE_UA(20b, .Lcopy_user_tail)
	_ASM_EXTABLE_UA(21b, .Lcopy_user_tail)
	_ASM_EXTABLE_UA(22b, .Lcopy_user_tail)
	_ASM_EXTABLE_UA(23b, .Lcopy_user_tail)
	_ASM_EXTABLE_UA(24b, .Lcopy_user_tail)
	_ASM_EXTABLE_UA(25b, .Lcopy_user_tail)
	_ASM_EXTABLE_UA(26b, .Lcopy_user_tail)
	_ASM_EXTABLE_UA(27b, .Lcopy_user_tail)
SYM_FUNC_END(rep_movs_alternative)
EXPORT_SYMBOL(rep_movs_alternative)