From 68674f94ffc9dddc45e7733963ecc35c5eda9efd Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 15 Apr 2023 11:47:06 -0700 Subject: [PATCH 01/11] x86: don't use REP_GOOD or ERMS for small memory copies The modern target to use is FSRM (Fast Short REP MOVS), and the other cases should only be used for bigger areas (ie mainly things like page copying and clearing). Signed-off-by: Linus Torvalds --- arch/x86/lib/memcpy_64.S | 34 ++++++++++------------------------ 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index a64017602010..8f95fb267caa 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -10,13 +10,6 @@ .section .noinstr.text, "ax" -/* - * We build a jump to memcpy_orig by default which gets NOPped out on - * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which - * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs - * to a jmp to memcpy_erms which does the REP; MOVSB mem copy. - */ - /* * memcpy - Copy a memory block. * @@ -27,17 +20,21 @@ * * Output: * rax original destination + * + * The FSRM alternative should be done inline (avoiding the call and + * the disgusting return handling), but that would require some help + * from the compiler for better calling conventions. + * + * The 'rep movsb' itself is small enough to replace the call, but the + * two register moves blow up the code. And one of them is "needed" + * only for the return value that is the same as the source input, + * which the compiler could/should do much better anyway. */ SYM_TYPED_FUNC_START(__memcpy) - ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \ - "jmp memcpy_erms", X86_FEATURE_ERMS + ALTERNATIVE "jmp memcpy_orig", "", X86_FEATURE_FSRM movq %rdi, %rax movq %rdx, %rcx - shrq $3, %rcx - andl $7, %edx - rep movsq - movl %edx, %ecx rep movsb RET SYM_FUNC_END(__memcpy) @@ -46,17 +43,6 @@ EXPORT_SYMBOL(__memcpy) SYM_FUNC_ALIAS(memcpy, __memcpy) EXPORT_SYMBOL(memcpy) -/* - * memcpy_erms() - enhanced fast string memcpy. This is faster and - * simpler than memcpy. Use memcpy_erms when possible. - */ -SYM_FUNC_START_LOCAL(memcpy_erms) - movq %rdi, %rax - movq %rdx, %rcx - rep movsb - RET -SYM_FUNC_END(memcpy_erms) - SYM_FUNC_START_LOCAL(memcpy_orig) movq %rdi, %rax From 20f3337d350c4e1b4ac66d731fd4e98565bf6cc0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 15 Apr 2023 12:01:14 -0700 Subject: [PATCH 02/11] x86: don't use REP_GOOD or ERMS for small memory clearing The modern target to use is FSRS (Fast Short REP STOS), and the other cases should only be used for bigger areas (ie mainly things like page clearing). Signed-off-by: Linus Torvalds --- arch/x86/lib/memset_64.S | 47 ++++++++++------------------------------ 1 file changed, 11 insertions(+), 36 deletions(-) diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 6143b1a6fa2c..7c59a704c458 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -18,27 +18,22 @@ * rdx count (bytes) * * rax original destination + * + * The FSRS alternative should be done inline (avoiding the call and + * the disgusting return handling), but that would require some help + * from the compiler for better calling conventions. + * + * The 'rep stosb' itself is small enough to replace the call, but all + * the register moves blow up the code. And two of them are "needed" + * only for the return value that is the same as the source input, + * which the compiler could/should do much better anyway. 
*/ SYM_FUNC_START(__memset) - /* - * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended - * to use it when possible. If not available, use fast string instructions. - * - * Otherwise, use original memset function. - */ - ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \ - "jmp memset_erms", X86_FEATURE_ERMS + ALTERNATIVE "jmp memset_orig", "", X86_FEATURE_FSRS movq %rdi,%r9 + movb %sil,%al movq %rdx,%rcx - andl $7,%edx - shrq $3,%rcx - /* expand byte value */ - movzbl %sil,%esi - movabs $0x0101010101010101,%rax - imulq %rsi,%rax - rep stosq - movl %edx,%ecx rep stosb movq %r9,%rax RET @@ -48,26 +43,6 @@ EXPORT_SYMBOL(__memset) SYM_FUNC_ALIAS(memset, __memset) EXPORT_SYMBOL(memset) -/* - * ISO C memset - set a memory block to a byte value. This function uses - * enhanced rep stosb to override the fast string function. - * The code is simpler and shorter than the fast string function as well. - * - * rdi destination - * rsi value (char) - * rdx count (bytes) - * - * rax original destination - */ -SYM_FUNC_START_LOCAL(memset_erms) - movq %rdi,%r9 - movb %sil,%al - movq %rdx,%rcx - rep stosb - movq %r9,%rax - RET -SYM_FUNC_END(memset_erms) - SYM_FUNC_START_LOCAL(memset_orig) movq %rdi,%r10 From adfcf4231b8cbc2d9c1e7bfaa965b907e60639eb Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 15 Apr 2023 13:14:59 -0700 Subject: [PATCH 03/11] x86: don't use REP_GOOD or ERMS for user memory copies The modern target to use is FSRM (Fast Short REP MOVS), and the other cases should only be used for bigger areas (ie mainly things like page clearing). Signed-off-by: Linus Torvalds --- arch/x86/include/asm/uaccess_64.h | 15 +++------ arch/x86/lib/copy_user_64.S | 53 +++++-------------------------- 2 files changed, 13 insertions(+), 55 deletions(-) diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index d13d71af5cf6..c697cf10b7c8 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -18,9 +18,7 @@ /* Handles exceptions in both to and from, but doesn't do access_ok */ __must_check unsigned long -copy_user_enhanced_fast_string(void *to, const void *from, unsigned len); -__must_check unsigned long -copy_user_generic_string(void *to, const void *from, unsigned len); +copy_user_fast_string(void *to, const void *from, unsigned len); __must_check unsigned long copy_user_generic_unrolled(void *to, const void *from, unsigned len); @@ -30,15 +28,12 @@ copy_user_generic(void *to, const void *from, unsigned len) unsigned ret; /* - * If CPU has ERMS feature, use copy_user_enhanced_fast_string. - * Otherwise, if CPU has rep_good feature, use copy_user_generic_string. + * If CPU has FSRM feature, use 'rep movs'. * Otherwise, use copy_user_generic_unrolled. */ - alternative_call_2(copy_user_generic_unrolled, - copy_user_generic_string, - X86_FEATURE_REP_GOOD, - copy_user_enhanced_fast_string, - X86_FEATURE_ERMS, + alternative_call(copy_user_generic_unrolled, + copy_user_fast_string, + X86_FEATURE_FSRM, ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from), "=d" (len)), "1" (to), "2" (from), "3" (len) diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 9dec1b38a98f..d0283bc7567d 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -104,8 +104,8 @@ SYM_FUNC_START(copy_user_generic_unrolled) SYM_FUNC_END(copy_user_generic_unrolled) EXPORT_SYMBOL(copy_user_generic_unrolled) -/* Some CPUs run faster using the string copy instructions. - * This is also a lot simpler. 
Use them when possible. +/* + * Some CPUs support FSRM for Fast Short REP MOVS. * * Only 4GB of copy is supported. This shouldn't be a problem * because the kernel normally only writes from/to page sized chunks @@ -122,58 +122,21 @@ EXPORT_SYMBOL(copy_user_generic_unrolled) * Output: * eax uncopied bytes or 0 if successful. */ -SYM_FUNC_START(copy_user_generic_string) +SYM_FUNC_START(copy_user_fast_string) ASM_STAC - cmpl $8,%edx - jb 2f /* less than 8 bytes, go to byte copy loop */ - ALIGN_DESTINATION - movl %edx,%ecx - shrl $3,%ecx - andl $7,%edx -1: rep movsq -2: movl %edx,%ecx -3: rep movsb - xorl %eax,%eax - ASM_CLAC - RET - -11: leal (%rdx,%rcx,8),%ecx -12: movl %ecx,%edx /* ecx is zerorest also */ - jmp .Lcopy_user_handle_tail - - _ASM_EXTABLE_CPY(1b, 11b) - _ASM_EXTABLE_CPY(3b, 12b) -SYM_FUNC_END(copy_user_generic_string) -EXPORT_SYMBOL(copy_user_generic_string) - -/* - * Some CPUs are adding enhanced REP MOVSB/STOSB instructions. - * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled. - * - * Input: - * rdi destination - * rsi source - * rdx count - * - * Output: - * eax uncopied bytes or 0 if successful. - */ -SYM_FUNC_START(copy_user_enhanced_fast_string) - ASM_STAC - /* CPUs without FSRM should avoid rep movsb for short copies */ - ALTERNATIVE "cmpl $64, %edx; jb copy_user_short_string", "", X86_FEATURE_FSRM movl %edx,%ecx 1: rep movsb xorl %eax,%eax ASM_CLAC RET -12: movl %ecx,%edx /* ecx is zerorest also */ - jmp .Lcopy_user_handle_tail +12: movl %ecx,%eax /* ecx is zerorest also */ + ASM_CLAC + RET _ASM_EXTABLE_CPY(1b, 12b) -SYM_FUNC_END(copy_user_enhanced_fast_string) -EXPORT_SYMBOL(copy_user_enhanced_fast_string) +SYM_FUNC_END(copy_user_fast_string) +EXPORT_SYMBOL(copy_user_fast_string) /* * Try to copy last bytes and clear the rest if needed. From d2c95f9d6802cc518d71d9795f4d9da54fb4e24d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 15 Apr 2023 13:22:31 -0700 Subject: [PATCH 04/11] x86: don't use REP_GOOD or ERMS for user memory clearing The modern target to use is FSRS (Fast Short REP STOS), and the other cases should only be used for bigger areas (ie mainly things like page clearing). Note! This changes the conditional for the inlining from FSRM ("fast short rep movs") to FSRS ("fast short rep stos"). We'll have a separate fixup for AMD microarchitectures that have a good 'rep stosb' yet do not set the new Intel-specific FSRS bit (because FSRM was there first). 
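Since this patch inlines the FSRS path with an ALTERNATIVE(), a quick illustration of the register contract may help. The sketch below is user-space code, not part of the patch, and the helper name rep_stosb_clear is made up; the kernel version additionally needs stac()/clac() and an exception-table entry for faults. What it shows is why the inlining works at all: 'rep stosb' takes the destination in %rdi, the count in %rcx and the fill byte in %al, and decrements %rcx as it stores, so the leftover count that __clear_user() must return is simply whatever remains in %rcx.

#include <stddef.h>

/*
 * Hypothetical user-space sketch of the inlined FSRS path: clear
 * 'len' bytes at 'dst' with a single 'rep stosb'.  "+D"/"+c" pin
 * dst/len to %rdi/%rcx, "a"(0) supplies the zero fill byte in %al,
 * and the value left in %rcx afterwards is the number of bytes
 * that were not cleared (always 0 here, since plain user-space
 * code does not take the faults __clear_user() has to survive).
 */
static inline size_t rep_stosb_clear(void *dst, size_t len)
{
	asm volatile("rep stosb"
		     : "+D" (dst), "+c" (len)
		     : "a" (0)
		     : "memory");
	return len;
}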
Signed-off-by: Linus Torvalds --- arch/x86/include/asm/uaccess_64.h | 14 ++---- arch/x86/lib/clear_page_64.S | 75 ------------------------------- tools/objtool/check.c | 2 - 3 files changed, 3 insertions(+), 88 deletions(-) diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index c697cf10b7c8..59ea54af505e 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -81,10 +81,6 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size) __must_check unsigned long clear_user_original(void __user *addr, unsigned long len); -__must_check unsigned long -clear_user_rep_good(void __user *addr, unsigned long len); -__must_check unsigned long -clear_user_erms(void __user *addr, unsigned long len); static __always_inline __must_check unsigned long __clear_user(void __user *addr, unsigned long size) { @@ -97,16 +93,12 @@ static __always_inline __must_check unsigned long __clear_user(void __user *addr */ asm volatile( "1:\n\t" - ALTERNATIVE_3("rep stosb", - "call clear_user_erms", ALT_NOT(X86_FEATURE_FSRM), - "call clear_user_rep_good", ALT_NOT(X86_FEATURE_ERMS), - "call clear_user_original", ALT_NOT(X86_FEATURE_REP_GOOD)) + ALTERNATIVE("rep stosb", + "call clear_user_original", ALT_NOT(X86_FEATURE_FSRS)) "2:\n" _ASM_EXTABLE_UA(1b, 2b) : "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT - : "a" (0) - /* rep_good clobbers %rdx */ - : "rdx"); + : "a" (0)); clac(); diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index ecbfb4dd3b01..fcd01b9f8d50 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -113,78 +113,3 @@ SYM_FUNC_START(clear_user_original) _ASM_EXTABLE_UA(.Lbytes, .Lbytes_exception) SYM_FUNC_END(clear_user_original) EXPORT_SYMBOL(clear_user_original) - -/* - * Alternative clear user-space when CPU feature X86_FEATURE_REP_GOOD is - * present. - * Input: - * rdi destination - * rcx count - * - * Output: - * rcx: uncleared bytes or 0 if successful. - */ -SYM_FUNC_START(clear_user_rep_good) - # call the original thing for less than a cacheline - cmp $64, %rcx - jb clear_user_original - -.Lprep: - # copy lower 32-bits for rest bytes - mov %ecx, %edx - shr $3, %rcx - jz .Lrep_good_rest_bytes - -.Lrep_good_qwords: - rep stosq - -.Lrep_good_rest_bytes: - and $7, %edx - jz .Lrep_good_exit - -.Lrep_good_bytes: - mov %edx, %ecx - rep stosb - -.Lrep_good_exit: - # see .Lexit comment above - xor %eax, %eax - RET - -.Lrep_good_qwords_exception: - # convert remaining qwords back into bytes to return to caller - shl $3, %rcx - and $7, %edx - add %rdx, %rcx - jmp .Lrep_good_exit - - _ASM_EXTABLE_UA(.Lrep_good_qwords, .Lrep_good_qwords_exception) - _ASM_EXTABLE_UA(.Lrep_good_bytes, .Lrep_good_exit) -SYM_FUNC_END(clear_user_rep_good) -EXPORT_SYMBOL(clear_user_rep_good) - -/* - * Alternative clear user-space when CPU feature X86_FEATURE_ERMS is present. - * Input: - * rdi destination - * rcx count - * - * Output: - * rcx: uncleared bytes or 0 if successful. 
- * - */ -SYM_FUNC_START(clear_user_erms) - # call the original thing for less than a cacheline - cmp $64, %rcx - jb clear_user_original - -.Lerms_bytes: - rep stosb - -.Lerms_exit: - xorl %eax,%eax - RET - - _ASM_EXTABLE_UA(.Lerms_bytes, .Lerms_exit) -SYM_FUNC_END(clear_user_erms) -EXPORT_SYMBOL(clear_user_erms) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index f937be1afe65..4907545d3ce3 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1284,8 +1284,6 @@ static const char *uaccess_safe_builtin[] = { "copy_mc_fragile_handle_tail", "copy_mc_enhanced_fast_string", "ftrace_likely_update", /* CONFIG_TRACE_BRANCH_PROFILING */ - "clear_user_erms", - "clear_user_rep_good", "clear_user_original", NULL }; From 3639a535587d7aac449cdce9710dfdc97a3c8c8e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 15 Apr 2023 13:39:15 -0700 Subject: [PATCH 05/11] x86: move stac/clac from user copy routines into callers This is preparatory work for inlining the 'rep movs' case, but also a cleanup. The __copy_user_nocache() function was mis-used by the rdma code to do uncached kernel copies that don't actually want user copies at all, and as a result doesn't want the stac/clac either. Signed-off-by: Linus Torvalds --- arch/x86/include/asm/uaccess_64.h | 8 +++++++- arch/x86/lib/copy_user_64.S | 10 ---------- arch/x86/lib/usercopy_64.c | 6 +++++- tools/objtool/check.c | 3 +++ 4 files changed, 15 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 59ea54af505e..339883729065 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -27,6 +27,7 @@ copy_user_generic(void *to, const void *from, unsigned len) { unsigned ret; + stac(); /* * If CPU has FSRM feature, use 'rep movs'. * Otherwise, use copy_user_generic_unrolled. @@ -38,6 +39,7 @@ copy_user_generic(void *to, const void *from, unsigned len) "=d" (len)), "1" (to), "2" (from), "3" (len) : "memory", "rcx", "r8", "r9", "r10", "r11"); + clac(); return ret; } @@ -64,8 +66,12 @@ static inline int __copy_from_user_inatomic_nocache(void *dst, const void __user *src, unsigned size) { + long ret; kasan_check_write(dst, size); - return __copy_user_nocache(dst, src, size, 0); + stac(); + ret = __copy_user_nocache(dst, src, size, 0); + clac(); + return ret; } static inline int diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index d0283bc7567d..818f2f728294 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -51,7 +51,6 @@ * eax uncopied bytes or 0 if successful. */ SYM_FUNC_START(copy_user_generic_unrolled) - ASM_STAC cmpl $8,%edx jb .Lcopy_user_short_string_bytes ALIGN_DESTINATION @@ -123,15 +122,12 @@ EXPORT_SYMBOL(copy_user_generic_unrolled) * eax uncopied bytes or 0 if successful. */ SYM_FUNC_START(copy_user_fast_string) - ASM_STAC movl %edx,%ecx 1: rep movsb xorl %eax,%eax - ASM_CLAC RET 12: movl %ecx,%eax /* ecx is zerorest also */ - ASM_CLAC RET _ASM_EXTABLE_CPY(1b, 12b) @@ -160,12 +156,10 @@ SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail) movl %edx,%ecx 1: rep movsb 2: mov %ecx,%eax - ASM_CLAC RET 3: movl %edx,%eax - ASM_CLAC RET _ASM_EXTABLE_CPY(1b, 2b) @@ -209,7 +203,6 @@ SYM_CODE_START_LOCAL(copy_user_short_string) decl %ecx jnz 21b 23: xor %eax,%eax - ASM_CLAC RET 40: leal (%rdx,%rcx,8),%edx @@ -233,8 +226,6 @@ SYM_CODE_END(copy_user_short_string) * - Require 4-byte alignment when size is 4 bytes. 
*/ SYM_FUNC_START(__copy_user_nocache) - ASM_STAC - /* If size is less than 8 bytes, go to 4-byte copy */ cmpl $8,%edx jb .L_4b_nocache_copy_entry @@ -327,7 +318,6 @@ SYM_FUNC_START(__copy_user_nocache) /* Finished copying; fence the prior stores */ .L_finish_copy: xorl %eax,%eax - ASM_CLAC sfence RET diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 6c1f8ac5e721..15704c605a2b 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -45,7 +45,11 @@ EXPORT_SYMBOL_GPL(arch_wb_cache_pmem); long __copy_user_flushcache(void *dst, const void __user *src, unsigned size) { unsigned long flushed, dest = (unsigned long) dst; - long rc = __copy_user_nocache(dst, src, size, 0); + long rc; + + stac(); + rc = __copy_user_nocache(dst, src, size, 0); + clac(); /* * __copy_user_nocache() uses non-temporal stores for the bulk diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 4907545d3ce3..ea62a4e03f1e 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1285,6 +1285,9 @@ static const char *uaccess_safe_builtin[] = { "copy_mc_enhanced_fast_string", "ftrace_likely_update", /* CONFIG_TRACE_BRANCH_PROFILING */ "clear_user_original", + "copy_user_generic_unrolled", + "copy_user_fast_string", + "__copy_user_nocache", NULL }; From 577e6a7fd50d519c201d20968b6a027a6563dc4c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 15 Apr 2023 19:31:34 -0700 Subject: [PATCH 06/11] x86: inline the 'rep movs' in user copies for the FSRM case This does the same thing for the user copies as commit 0db7058e8e23 ("x86/clear_user: Make it faster") did for clear_user(). In other words, it inlines the "rep movs" case when X86_FEATURE_FSRM is set, avoiding the function call entirely. In order to do that, it makes the calling convention for the out-of-line case ("copy_user_generic_unrolled") match the 'rep movs' calling convention, although it does also end up clobbering a number of additional registers. Also, to simplify code sharing in the low-level assembly with the __copy_user_nocache() function (that uses the normal C calling convention), we end up with a kind of mixed return value for the low-level asm code: it will return the result in both %rcx (to work as an alternative for the 'rep movs' case), _and_ in %rax (for the nocache case). We could avoid this by wrapping __copy_user_nocache() callers in an inline asm, but since the cost is just an extra register copy, it's probably not worth it. Signed-off-by: Linus Torvalds --- arch/x86/include/asm/uaccess_64.h | 23 ++++++------- arch/x86/lib/copy_user_64.S | 55 ++++++++++++------------------- tools/objtool/check.c | 1 - 3 files changed, 31 insertions(+), 48 deletions(-) diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 339883729065..8cc918acbabc 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -18,29 +18,26 @@ /* Handles exceptions in both to and from, but doesn't do access_ok */ __must_check unsigned long -copy_user_fast_string(void *to, const void *from, unsigned len); -__must_check unsigned long copy_user_generic_unrolled(void *to, const void *from, unsigned len); static __always_inline __must_check unsigned long -copy_user_generic(void *to, const void *from, unsigned len) +copy_user_generic(void *to, const void *from, unsigned long len) { - unsigned ret; - stac(); /* * If CPU has FSRM feature, use 'rep movs'. * Otherwise, use copy_user_generic_unrolled. 
*/ - alternative_call(copy_user_generic_unrolled, - copy_user_fast_string, - X86_FEATURE_FSRM, - ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from), - "=d" (len)), - "1" (to), "2" (from), "3" (len) - : "memory", "rcx", "r8", "r9", "r10", "r11"); + asm volatile( + "1:\n\t" + ALTERNATIVE("rep movsb", + "call copy_user_generic_unrolled", ALT_NOT(X86_FEATURE_FSRM)) + "2:\n" + _ASM_EXTABLE_UA(1b, 2b) + :"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT + : : "memory", "rax", "rdx", "r8", "r9", "r10", "r11"); clac(); - return ret; + return len; } static __always_inline __must_check unsigned long diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 818f2f728294..16a743f11b11 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -45,13 +45,29 @@ * Input: * rdi destination * rsi source - * rdx count + * rcx count * * Output: - * eax uncopied bytes or 0 if successful. + * rcx uncopied bytes or 0 if successful. + * + * NOTE! The calling convention is very intentionally the same as + * for 'rep movs', so that we can rewrite the function call with + * just a plain 'rep movs' on machines that have FSRM. + * + * HOWEVER! This function ends up having a lot of the code common + * with __copy_user_nocache(), which is a normal C function, and + * has a similar calling convention, but gets the 'count' in %rdx, + * and returns the result in %rax. + * + * To share as much code as possible, we end up returning the + * result in *both* %rcx/%rax, and we also move the initial count + * into %rdx. + * + * We can clobber rdx/rsi/rdi and r8-r11 */ SYM_FUNC_START(copy_user_generic_unrolled) - cmpl $8,%edx + movl %ecx,%edx + cmpl $8,%ecx jb .Lcopy_user_short_string_bytes ALIGN_DESTINATION movl %edx,%ecx @@ -103,37 +119,6 @@ SYM_FUNC_START(copy_user_generic_unrolled) SYM_FUNC_END(copy_user_generic_unrolled) EXPORT_SYMBOL(copy_user_generic_unrolled) -/* - * Some CPUs support FSRM for Fast Short REP MOVS. - * - * Only 4GB of copy is supported. This shouldn't be a problem - * because the kernel normally only writes from/to page sized chunks - * even if user space passed a longer buffer. - * And more would be dangerous because both Intel and AMD have - * errata with rep movsq > 4GB. If someone feels the need to fix - * this please consider this. - * - * Input: - * rdi destination - * rsi source - * rdx count - * - * Output: - * eax uncopied bytes or 0 if successful. - */ -SYM_FUNC_START(copy_user_fast_string) - movl %edx,%ecx -1: rep movsb - xorl %eax,%eax - RET - -12: movl %ecx,%eax /* ecx is zerorest also */ - RET - - _ASM_EXTABLE_CPY(1b, 12b) -SYM_FUNC_END(copy_user_fast_string) -EXPORT_SYMBOL(copy_user_fast_string) - /* * Try to copy last bytes and clear the rest if needed. 
* Since protection fault in copy_from/to_user is not a normal situation, @@ -160,6 +145,7 @@ SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail) 3: movl %edx,%eax + movl %edx,%ecx RET _ASM_EXTABLE_CPY(1b, 2b) @@ -203,6 +189,7 @@ SYM_CODE_START_LOCAL(copy_user_short_string) decl %ecx jnz 21b 23: xor %eax,%eax + xor %ecx,%ecx RET 40: leal (%rdx,%rcx,8),%edx diff --git a/tools/objtool/check.c b/tools/objtool/check.c index ea62a4e03f1e..44817bbe48fe 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1286,7 +1286,6 @@ static const char *uaccess_safe_builtin[] = { "ftrace_likely_update", /* CONFIG_TRACE_BRANCH_PROFILING */ "clear_user_original", "copy_user_generic_unrolled", - "copy_user_fast_string", "__copy_user_nocache", NULL }; From 8c9b6a88b7e2f33c656cd667a081bfd4dc8f5005 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 16 Apr 2023 14:06:58 -0700 Subject: [PATCH 07/11] x86: improve on the non-rep 'clear_user' function The old version was oddly written to have the repeat count in multiple registers. So instead of taking advantage of %rax being zero, it had some sub-counts in it. All just for a "single word clearing" loop, which isn't even efficient to begin with. So get rid of those games, and just keep all the state in the same registers we got it in (and that we should return things in). That not only makes this act much more like 'rep stos' (which this function is replacing), but makes it much easier to actually do the obvious loop unrolling. Also rename the function from the now nonsensical 'clear_user_original' to what it now clearly is: 'rep_stos_alternative'. End result: if we don't have a fast 'rep stosb', at least we can have a fast fallback for it. Signed-off-by: Linus Torvalds --- arch/x86/include/asm/uaccess_64.h | 4 +- arch/x86/lib/clear_page_64.S | 116 ++++++++++++++++++------------ tools/objtool/check.c | 2 +- 3 files changed, 74 insertions(+), 48 deletions(-) diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 8cc918acbabc..a0533e672496 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -83,7 +83,7 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size) */ __must_check unsigned long -clear_user_original(void __user *addr, unsigned long len); +rep_stos_alternative(void __user *addr, unsigned long len); static __always_inline __must_check unsigned long __clear_user(void __user *addr, unsigned long size) { @@ -97,7 +97,7 @@ static __always_inline __must_check unsigned long __clear_user(void __user *addr asm volatile( "1:\n\t" ALTERNATIVE("rep stosb", - "call clear_user_original", ALT_NOT(X86_FEATURE_FSRS)) + "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRS)) "2:\n" _ASM_EXTABLE_UA(1b, 2b) : "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index fcd01b9f8d50..f74a3e704a1c 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -57,59 +57,85 @@ EXPORT_SYMBOL_GPL(clear_page_erms) * Input: * rdi destination * rcx count + * rax is zero * * Output: * rcx: uncleared bytes or 0 if successful. */ -SYM_FUNC_START(clear_user_original) - /* - * Copy only the lower 32 bits of size as that is enough to handle the rest bytes, - * i.e., no need for a 'q' suffix and thus a REX prefix. 
- */ - mov %ecx,%eax - shr $3,%rcx - jz .Lrest_bytes +SYM_FUNC_START(rep_stos_alternative) + cmpq $64,%rcx + jae .Lunrolled - # do the qwords first - .p2align 4 -.Lqwords: - movq $0,(%rdi) - lea 8(%rdi),%rdi - dec %rcx - jnz .Lqwords + cmp $8,%ecx + jae .Lword -.Lrest_bytes: - and $7, %eax - jz .Lexit + testl %ecx,%ecx + je .Lexit - # now do the rest bytes -.Lbytes: - movb $0,(%rdi) +.Lclear_user_tail: +0: movb %al,(%rdi) inc %rdi - dec %eax - jnz .Lbytes - + dec %rcx + jnz .Lclear_user_tail .Lexit: + RET + + _ASM_EXTABLE_UA( 0b, .Lexit) + +.Lword: +1: movq %rax,(%rdi) + addq $8,%rdi + sub $8,%ecx + je .Lexit + cmp $8,%ecx + jae .Lword + jmp .Lclear_user_tail + + .p2align 4 +.Lunrolled: +10: movq %rax,(%rdi) +11: movq %rax,8(%rdi) +12: movq %rax,16(%rdi) +13: movq %rax,24(%rdi) +14: movq %rax,32(%rdi) +15: movq %rax,40(%rdi) +16: movq %rax,48(%rdi) +17: movq %rax,56(%rdi) + addq $64,%rdi + subq $64,%rcx + cmpq $64,%rcx + jae .Lunrolled + cmpl $8,%ecx + jae .Lword + testl %ecx,%ecx + jne .Lclear_user_tail + RET + /* - * %rax still needs to be cleared in the exception case because this function is called - * from inline asm and the compiler expects %rax to be zero when exiting the inline asm, - * in case it might reuse it somewhere. + * If we take an exception on any of the + * word stores, we know that %rcx isn't zero, + * so we can just go to the tail clearing to + * get the exact count. + * + * The unrolled case might end up clearing + * some bytes twice. Don't care. + * + * We could use the value in %rdi to avoid + * a second fault on the exact count case, + * but do we really care? No. + * + * Finally, we could try to align %rdi at the + * top of the unrolling. But unaligned stores + * just aren't that common or expensive. */ - xor %eax,%eax - RET - -.Lqwords_exception: - # convert remaining qwords back into bytes to return to caller - shl $3, %rcx - and $7, %eax - add %rax,%rcx - jmp .Lexit - -.Lbytes_exception: - mov %eax,%ecx - jmp .Lexit - - _ASM_EXTABLE_UA(.Lqwords, .Lqwords_exception) - _ASM_EXTABLE_UA(.Lbytes, .Lbytes_exception) -SYM_FUNC_END(clear_user_original) -EXPORT_SYMBOL(clear_user_original) + _ASM_EXTABLE_UA( 1b, .Lclear_user_tail) + _ASM_EXTABLE_UA(10b, .Lclear_user_tail) + _ASM_EXTABLE_UA(11b, .Lclear_user_tail) + _ASM_EXTABLE_UA(12b, .Lclear_user_tail) + _ASM_EXTABLE_UA(13b, .Lclear_user_tail) + _ASM_EXTABLE_UA(14b, .Lclear_user_tail) + _ASM_EXTABLE_UA(15b, .Lclear_user_tail) + _ASM_EXTABLE_UA(16b, .Lclear_user_tail) + _ASM_EXTABLE_UA(17b, .Lclear_user_tail) +SYM_FUNC_END(rep_stos_alternative) +EXPORT_SYMBOL(rep_stos_alternative) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 44817bbe48fe..ac96c9939cd1 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1284,7 +1284,7 @@ static const char *uaccess_safe_builtin[] = { "copy_mc_fragile_handle_tail", "copy_mc_enhanced_fast_string", "ftrace_likely_update", /* CONFIG_TRACE_BRANCH_PROFILING */ - "clear_user_original", + "rep_stos_alternative", "copy_user_generic_unrolled", "__copy_user_nocache", NULL From 427fda2c8a4977d9dbd9bc108bbe6e21ec84648d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 16 Apr 2023 18:23:06 -0700 Subject: [PATCH 08/11] x86: improve on the non-rep 'copy_user' function The old 'copy_user_generic_unrolled' function was oddly implemented for largely historical reasons: it had been largely based on the uncached copy case, which has some other concerns. 
For example, the __copy_user_nocache() function uses 'movnti' for the destination stores, and those want the destination to be aligned. In contrast, the regular copy function doesn't really care, and trying to align things only complicates matters. Also, like the clear_user function, the copy function had some odd handling of the repeat counts, complicating the exception handling for no really good reason. So as with clear_user, just write it to keep all the byte counts in the %rcx register, exactly like the 'rep movs' functionality that this replaces. Unlike a real 'rep movs', we do allow for this to trash a few temporary registers to not have to unnecessarily save/restore registers on the stack. And like the clearing case, rename this to what it now clearly is: 'rep_movs_alternative', and make it one coherent function, so that it shows up as such in profiles (instead of the odd split between "copy_user_generic_unrolled" and "copy_user_short_string", the latter of which was not about strings at all, and which was shared with the uncached case). Signed-off-by: Linus Torvalds --- arch/x86/include/asm/uaccess_64.h | 8 +- arch/x86/lib/copy_user_64.S | 307 ++++++++++++++---------------- tools/objtool/check.c | 2 +- 3 files changed, 147 insertions(+), 170 deletions(-) diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index a0533e672496..435ca24c5e1d 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -18,7 +18,7 @@ /* Handles exceptions in both to and from, but doesn't do access_ok */ __must_check unsigned long -copy_user_generic_unrolled(void *to, const void *from, unsigned len); +rep_movs_alternative(void *to, const void *from, unsigned len); static __always_inline __must_check unsigned long copy_user_generic(void *to, const void *from, unsigned long len) @@ -26,16 +26,16 @@ copy_user_generic(void *to, const void *from, unsigned long len) stac(); /* * If CPU has FSRM feature, use 'rep movs'. - * Otherwise, use copy_user_generic_unrolled. + * Otherwise, use rep_movs_alternative. */ asm volatile( "1:\n\t" ALTERNATIVE("rep movsb", - "call copy_user_generic_unrolled", ALT_NOT(X86_FEATURE_FSRM)) + "call rep_movs_alternative", ALT_NOT(X86_FEATURE_FSRM)) "2:\n" _ASM_EXTABLE_UA(1b, 2b) :"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT - : : "memory", "rax", "rdx", "r8", "r9", "r10", "r11"); + : : "memory", "rax", "r8", "r9", "r10", "r11"); clac(); return len; } diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 16a743f11b11..85e6c45b1ca9 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -17,6 +17,113 @@ #include #include +/* + * rep_movs_alternative - memory copy with exception handling. + * This version is for CPUs that don't have FSRM (Fast Short Rep Movs) + * + * Input: + * rdi destination + * rsi source + * rcx count + * + * Output: + * rcx uncopied bytes or 0 if successful. + * + * NOTE! The calling convention is very intentionally the same as + * for 'rep movs', so that we can rewrite the function call with + * just a plain 'rep movs' on machines that have FSRM. But to make + * it simpler for us, we can clobber rsi/rdi and rax/r8-r11 freely. 
+ */ +SYM_FUNC_START(rep_movs_alternative) + cmpq $64,%rcx + jae .Lunrolled + + cmp $8,%ecx + jae .Lword + + testl %ecx,%ecx + je .Lexit + +.Lcopy_user_tail: +0: movb (%rsi),%al +1: movb %al,(%rdi) + inc %rdi + inc %rsi + dec %rcx + jne .Lcopy_user_tail +.Lexit: + RET + + _ASM_EXTABLE_UA( 0b, .Lexit) + _ASM_EXTABLE_UA( 1b, .Lexit) + + .p2align 4 +.Lword: +2: movq (%rsi),%rax +3: movq %rax,(%rdi) + addq $8,%rsi + addq $8,%rdi + sub $8,%ecx + je .Lexit + cmp $8,%ecx + jae .Lword + jmp .Lcopy_user_tail + + _ASM_EXTABLE_UA( 2b, .Lcopy_user_tail) + _ASM_EXTABLE_UA( 3b, .Lcopy_user_tail) + + .p2align 4 +.Lunrolled: +10: movq (%rsi),%r8 +11: movq 8(%rsi),%r9 +12: movq 16(%rsi),%r10 +13: movq 24(%rsi),%r11 +14: movq %r8,(%rdi) +15: movq %r9,8(%rdi) +16: movq %r10,16(%rdi) +17: movq %r11,24(%rdi) +20: movq 32(%rsi),%r8 +21: movq 40(%rsi),%r9 +22: movq 48(%rsi),%r10 +23: movq 56(%rsi),%r11 +24: movq %r8,32(%rdi) +25: movq %r9,40(%rdi) +26: movq %r10,48(%rdi) +27: movq %r11,56(%rdi) + addq $64,%rsi + addq $64,%rdi + subq $64,%rcx + cmpq $64,%rcx + jae .Lunrolled + cmpl $8,%ecx + jae .Lword + testl %ecx,%ecx + jne .Lcopy_user_tail + RET + + _ASM_EXTABLE_UA(10b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(11b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(12b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(13b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(14b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(15b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(16b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(17b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(20b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(21b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(22b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(23b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(24b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(25b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(26b, .Lcopy_user_tail) + _ASM_EXTABLE_UA(27b, .Lcopy_user_tail) +SYM_FUNC_END(rep_movs_alternative) +EXPORT_SYMBOL(rep_movs_alternative) + +/* + * The uncached copy needs to align the destination for + * movnti and friends. + */ .macro ALIGN_DESTINATION /* check for bad alignment of destination */ movl %edi,%ecx @@ -37,171 +144,6 @@ _ASM_EXTABLE_CPY(101b, .Lcopy_user_handle_align) .endm -/* - * copy_user_generic_unrolled - memory copy with exception handling. - * This version is for CPUs like P4 that don't have efficient micro - * code for rep movsq - * - * Input: - * rdi destination - * rsi source - * rcx count - * - * Output: - * rcx uncopied bytes or 0 if successful. - * - * NOTE! The calling convention is very intentionally the same as - * for 'rep movs', so that we can rewrite the function call with - * just a plain 'rep movs' on machines that have FSRM. - * - * HOWEVER! This function ends up having a lot of the code common - * with __copy_user_nocache(), which is a normal C function, and - * has a similar calling convention, but gets the 'count' in %rdx, - * and returns the result in %rax. - * - * To share as much code as possible, we end up returning the - * result in *both* %rcx/%rax, and we also move the initial count - * into %rdx. 
- * - * We can clobber rdx/rsi/rdi and r8-r11 - */ -SYM_FUNC_START(copy_user_generic_unrolled) - movl %ecx,%edx - cmpl $8,%ecx - jb .Lcopy_user_short_string_bytes - ALIGN_DESTINATION - movl %edx,%ecx - andl $63,%edx - shrl $6,%ecx - jz copy_user_short_string -1: movq (%rsi),%r8 -2: movq 1*8(%rsi),%r9 -3: movq 2*8(%rsi),%r10 -4: movq 3*8(%rsi),%r11 -5: movq %r8,(%rdi) -6: movq %r9,1*8(%rdi) -7: movq %r10,2*8(%rdi) -8: movq %r11,3*8(%rdi) -9: movq 4*8(%rsi),%r8 -10: movq 5*8(%rsi),%r9 -11: movq 6*8(%rsi),%r10 -12: movq 7*8(%rsi),%r11 -13: movq %r8,4*8(%rdi) -14: movq %r9,5*8(%rdi) -15: movq %r10,6*8(%rdi) -16: movq %r11,7*8(%rdi) - leaq 64(%rsi),%rsi - leaq 64(%rdi),%rdi - decl %ecx - jnz 1b - jmp copy_user_short_string - -30: shll $6,%ecx - addl %ecx,%edx - jmp .Lcopy_user_handle_tail - - _ASM_EXTABLE_CPY(1b, 30b) - _ASM_EXTABLE_CPY(2b, 30b) - _ASM_EXTABLE_CPY(3b, 30b) - _ASM_EXTABLE_CPY(4b, 30b) - _ASM_EXTABLE_CPY(5b, 30b) - _ASM_EXTABLE_CPY(6b, 30b) - _ASM_EXTABLE_CPY(7b, 30b) - _ASM_EXTABLE_CPY(8b, 30b) - _ASM_EXTABLE_CPY(9b, 30b) - _ASM_EXTABLE_CPY(10b, 30b) - _ASM_EXTABLE_CPY(11b, 30b) - _ASM_EXTABLE_CPY(12b, 30b) - _ASM_EXTABLE_CPY(13b, 30b) - _ASM_EXTABLE_CPY(14b, 30b) - _ASM_EXTABLE_CPY(15b, 30b) - _ASM_EXTABLE_CPY(16b, 30b) -SYM_FUNC_END(copy_user_generic_unrolled) -EXPORT_SYMBOL(copy_user_generic_unrolled) - -/* - * Try to copy last bytes and clear the rest if needed. - * Since protection fault in copy_from/to_user is not a normal situation, - * it is not necessary to optimize tail handling. - * Don't try to copy the tail if machine check happened - * - * Input: - * eax trap number written by ex_handler_copy() - * rdi destination - * rsi source - * rdx count - * - * Output: - * eax uncopied bytes or 0 if successful. - */ -SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail) - cmp $X86_TRAP_MC,%eax - je 3f - - movl %edx,%ecx -1: rep movsb -2: mov %ecx,%eax - RET - -3: - movl %edx,%eax - movl %edx,%ecx - RET - - _ASM_EXTABLE_CPY(1b, 2b) - -.Lcopy_user_handle_align: - addl %ecx,%edx /* ecx is zerorest also */ - jmp .Lcopy_user_handle_tail - -SYM_CODE_END(.Lcopy_user_handle_tail) - -/* - * Finish memcpy of less than 64 bytes. #AC should already be set. - * - * Input: - * rdi destination - * rsi source - * rdx count (< 64) - * - * Output: - * eax uncopied bytes or 0 if successful. - */ -SYM_CODE_START_LOCAL(copy_user_short_string) - movl %edx,%ecx - andl $7,%edx - shrl $3,%ecx - jz .Lcopy_user_short_string_bytes -18: movq (%rsi),%r8 -19: movq %r8,(%rdi) - leaq 8(%rsi),%rsi - leaq 8(%rdi),%rdi - decl %ecx - jnz 18b -.Lcopy_user_short_string_bytes: - andl %edx,%edx - jz 23f - movl %edx,%ecx -21: movb (%rsi),%al -22: movb %al,(%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz 21b -23: xor %eax,%eax - xor %ecx,%ecx - RET - -40: leal (%rdx,%rcx,8),%edx - jmp 60f -50: movl %ecx,%edx /* ecx is zerorest also */ -60: jmp .Lcopy_user_handle_tail - - _ASM_EXTABLE_CPY(18b, 40b) - _ASM_EXTABLE_CPY(19b, 40b) - _ASM_EXTABLE_CPY(21b, 50b) - _ASM_EXTABLE_CPY(22b, 50b) -SYM_CODE_END(copy_user_short_string) /* * copy_user_nocache - Uncached memory copy with exception handling @@ -346,5 +288,40 @@ SYM_FUNC_START(__copy_user_nocache) _ASM_EXTABLE_CPY(31b, .L_fixup_4b_copy) _ASM_EXTABLE_CPY(40b, .L_fixup_1b_copy) _ASM_EXTABLE_CPY(41b, .L_fixup_1b_copy) + +/* + * Try to copy last bytes and clear the rest if needed. + * Since protection fault in copy_from/to_user is not a normal situation, + * it is not necessary to optimize tail handling. 
+ * Don't try to copy the tail if machine check happened + * + * Input: + * eax trap number written by ex_handler_copy() + * rdi destination + * rsi source + * rdx count + * + * Output: + * eax uncopied bytes or 0 if successful. + */ +.Lcopy_user_handle_tail: + cmp $X86_TRAP_MC,%eax + je 3f + + movl %edx,%ecx +1: rep movsb +2: mov %ecx,%eax + RET + +3: + movl %edx,%eax + RET + + _ASM_EXTABLE_CPY(1b, 2b) + +.Lcopy_user_handle_align: + addl %ecx,%edx /* ecx is zerorest also */ + jmp .Lcopy_user_handle_tail + SYM_FUNC_END(__copy_user_nocache) EXPORT_SYMBOL(__copy_user_nocache) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index ac96c9939cd1..50ed63f701f1 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1285,7 +1285,7 @@ static const char *uaccess_safe_builtin[] = { "copy_mc_enhanced_fast_string", "ftrace_likely_update", /* CONFIG_TRACE_BRANCH_PROFILING */ "rep_stos_alternative", - "copy_user_generic_unrolled", + "rep_movs_alternative", "__copy_user_nocache", NULL }; From e046fe5a36a970bc14fbfbcb2074a48776f6b671 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 17 Apr 2023 12:13:35 -0700 Subject: [PATCH 09/11] x86: set FSRS automatically on AMD CPUs that have FSRM So Intel introduced the FSRS ("Fast Short REP STOS") CPU capability bit, because they seem to have done the (much simpler) REP STOS optimizations separately and later than the REP MOVS one. In contrast, when AMD introduced support for FSRM ("Fast Short REP MOVS"), in the Zen 3 core, it appears to have improved the REP STOS case at the same time, and since the FSRS bit was added by Intel later, it doesn't show up on those AMD Zen 3 cores. And now that we made use of FSRS for the "rep stos" conditional, that made those AMD machines unnecessarily slower. The Intel situation where "rep movs" is fast, but "rep stos" isn't, is just odd. The 'stos' case is a lot simpler with no aliasing, no mutual alignment issues, no complicated cases. So this just sets FSRS automatically when FSRM is available on AMD machines, to get back all the nice REP STOS goodness in Zen 3. Reported-and-tested-by: Jens Axboe Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/amd.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 95cdd08c4cbb..1547781e505b 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -929,6 +929,10 @@ static void init_amd(struct cpuinfo_x86 *c) if (c->x86 >= 0x10) set_cpu_cap(c, X86_FEATURE_REP_GOOD); + /* AMD FSRM also implies FSRS */ + if (cpu_has(c, X86_FEATURE_FSRM)) + set_cpu_cap(c, X86_FEATURE_FSRS); + /* get apicid instead of initial apic id from cpuid */ c->apicid = hard_smp_processor_id(); From e1f2750edc4afebb966a229b797fc89b98ee6098 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 19 Apr 2023 19:09:52 -0700 Subject: [PATCH 10/11] x86: remove 'zerorest' argument from __copy_user_nocache() Every caller passes in zero, meaning they don't want any partial copy to zero the remainder of the destination buffer. Which is just as well, because the implementation of that function didn't actually even look at that argument, and wasn't even aware it existed, although some misleading comments did mention it still. The 'zerorest' thing is a historical artifact of how "copy_from_user()" worked, in that it would zero the rest of the kernel buffer that it copied into. That zeroing still exists, but it's long since been moved to generic code, and the raw architecture-specific code doesn't do it. 
See _copy_from_user() in lib/usercopy.c for this all. However, while __copy_user_nocache() shares some history and superficial other similarities with copy_from_user(), it is in many ways also very different. In particular, while the code makes it *look* similar to the generic user copy functions that can copy both to and from user space, and take faults on both reads and writes as a result, __copy_user_nocache() does no such thing at all. __copy_user_nocache() always copies to kernel space, and will never take a page fault on the destination. What *can* happen, though, is that the non-temporal stores take a machine check because one of the use cases is for writing to stable memory, and any memory errors would then take synchronous faults. So __copy_user_nocache() does look a lot like copy_from_user(), but has faulting behavior that is more akin to our old copy_in_user() (which no longer exists, but copied from user space to user space and could fault on both source and destination). And it very much does not have the "zero the end of the destination buffer", since a problem with the destination buffer is very possibly the very source of the partial copy. So this whole thing was just a confusing historical artifact from having shared some code with a completely different function with completely different use cases. Signed-off-by: Linus Torvalds --- arch/x86/include/asm/uaccess_64.h | 6 ++---- arch/x86/lib/copy_user_64.S | 4 ++-- arch/x86/lib/usercopy_64.c | 2 +- drivers/infiniband/sw/rdmavt/qp.c | 2 +- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 435ca24c5e1d..0a49a8de9f3c 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -52,9 +52,7 @@ raw_copy_to_user(void __user *dst, const void *src, unsigned long size) return copy_user_generic((__force void *)dst, src, size); } -extern long __copy_user_nocache(void *dst, const void __user *src, - unsigned size, int zerorest); - +extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size); extern long __copy_user_flushcache(void *dst, const void __user *src, unsigned size); extern void memcpy_page_flushcache(char *to, struct page *page, size_t offset, size_t len); @@ -66,7 +64,7 @@ __copy_from_user_inatomic_nocache(void *dst, const void __user *src, long ret; kasan_check_write(dst, size); stac(); - ret = __copy_user_nocache(dst, src, size, 0); + ret = __copy_user_nocache(dst, src, size); clac(); return ret; } diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 85e6c45b1ca9..d424fb75e0f0 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -290,7 +290,7 @@ SYM_FUNC_START(__copy_user_nocache) _ASM_EXTABLE_CPY(41b, .L_fixup_1b_copy) /* - * Try to copy last bytes and clear the rest if needed. + * Try to copy last bytes. * Since protection fault in copy_from/to_user is not a normal situation, * it is not necessary to optimize tail handling. 
* Don't try to copy the tail if machine check happened @@ -320,7 +320,7 @@ SYM_FUNC_START(__copy_user_nocache) _ASM_EXTABLE_CPY(1b, 2b) .Lcopy_user_handle_align: - addl %ecx,%edx /* ecx is zerorest also */ + addl %ecx,%edx jmp .Lcopy_user_handle_tail SYM_FUNC_END(__copy_user_nocache) diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 15704c605a2b..c3a5bbc0b41e 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -48,7 +48,7 @@ long __copy_user_flushcache(void *dst, const void __user *src, unsigned size) long rc; stac(); - rc = __copy_user_nocache(dst, src, size, 0); + rc = __copy_user_nocache(dst, src, size); clac(); /* diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 3acab569fbb9..9b4c0389d2c0 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -97,7 +97,7 @@ static void cacheless_memcpy(void *dst, void *src, size_t n) * there are no security issues. The extra fault recovery machinery * is not invoked. */ - __copy_user_nocache(dst, (void __user *)src, n, 0); + __copy_user_nocache(dst, (void __user *)src, n); } void rvt_wss_exit(struct rvt_dev_info *rdi) From 034ff37d34071ff3f48755f728cd229e42a4f15d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 20 Apr 2023 15:13:50 -0700 Subject: [PATCH 11/11] x86: rewrite '__copy_user_nocache' function I didn't really want to do this, but as part of all the other changes to the user copy loops, I've been looking at this horror. I tried to clean it up multiple times, but every time I just found more problems, and the way it's written, it's just too hard to fix them. For example, the code is written to do quad-word alignment, and will use regular byte accesses to get to that point. That's fairly simple, but it means that any initial 8-byte alignment will be done with cached copies. However, the code then is very careful to do any 4-byte _tail_ accesses using an uncached 4-byte write, and that was claimed to be relevant in commit a82eee742452 ("x86/uaccess/64: Handle the caching of 4-byte nocache copies properly in __copy_user_nocache()"). So if you do a 4-byte copy using that function, it carefully uses a 4-byte 'movnti' for the destination. But if you were to do a 12-byte copy that is 4-byte aligned, it would _not_ do a 4-byte 'movnti' followed by a 8-byte 'movnti' to keep it all uncached. Instead, it would align the destination to 8 bytes using a byte-at-a-time loop, and then do a 8-byte 'movnti' for the final 8 bytes. The main caller that cares is __copy_user_flushcache(), which knows about this insanity, and has odd cases for it all. But I just can't deal with looking at this kind of "it does one case right, and another related case entirely wrong". And the code really wasn't fixable without hard drugs, which I try to avoid. So instead, rewrite it in a form that hopefully not only gets this right, but is a bit more maintainable. Knock wood. 
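To make the intended store pattern concrete, here is a rough user-space sketch of the shape the rewrite is going for, written with the x86-64 SSE2 streaming-store intrinsics instead of kernel asm. It is an illustration under simplifying assumptions: the helper name copy_nocache_sketch is invented, there is no fault or machine-check handling, and the head alignment is done purely with cached byte stores rather than the word/dword steps the new asm uses. The structure is the point: cached stores until the destination is 8-byte aligned, aligned 8-byte non-temporal stores for the bulk, one 4-byte non-temporal store when at least four bytes remain, cached stores for the last few bytes, and an sfence so the weakly ordered streaming stores are globally visible before the copy is reported done.

#include <emmintrin.h>	/* _mm_stream_si32(), _mm_stream_si64(), _mm_sfence() */
#include <stdint.h>
#include <string.h>

static void copy_nocache_sketch(void *dst, const void *src, size_t len)
{
	char *d = dst;
	const char *s = src;
	long long q;
	int l;

	/* cached byte stores until the destination is 8-byte aligned */
	while (((uintptr_t)d & 7) && len) {
		*d++ = *s++;
		len--;
	}

	/* aligned bulk: 8-byte 'movnti' stores */
	for (; len >= 8; d += 8, s += 8, len -= 8) {
		memcpy(&q, s, 8);
		_mm_stream_si64((long long *)d, q);
	}

	/* one more 4-byte 'movnti' if possible (destination is still aligned) */
	if (len >= 4) {
		memcpy(&l, s, 4);
		_mm_stream_si32((int *)d, l);
		d += 4;
		s += 4;
		len -= 4;
	}

	/* cached stores for the final 1-3 bytes */
	while (len--)
		*d++ = *s++;

	/* order the streaming stores before the copy is considered done */
	_mm_sfence();
}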
Signed-off-by: Linus Torvalds --- arch/x86/lib/Makefile | 2 +- arch/x86/lib/copy_user_64.S | 213 ----------------------- arch/x86/lib/copy_user_uncached_64.S | 242 +++++++++++++++++++++++++++ 3 files changed, 243 insertions(+), 214 deletions(-) create mode 100644 arch/x86/lib/copy_user_uncached_64.S diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 4f1a40a86534..01932af64193 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -71,6 +71,6 @@ ifneq ($(CONFIG_GENERIC_CSUM),y) endif lib-y += clear_page_64.o copy_page_64.o lib-y += memmove_64.o memset_64.o - lib-y += copy_user_64.o + lib-y += copy_user_64.o copy_user_uncached_64.o lib-y += cmpxchg16b_emu.o endif diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index d424fb75e0f0..4fc5c2de2de4 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -7,15 +7,8 @@ */ #include -#include -#include -#include -#include -#include #include -#include #include -#include /* * rep_movs_alternative - memory copy with exception handling. @@ -119,209 +112,3 @@ SYM_FUNC_START(rep_movs_alternative) _ASM_EXTABLE_UA(27b, .Lcopy_user_tail) SYM_FUNC_END(rep_movs_alternative) EXPORT_SYMBOL(rep_movs_alternative) - -/* - * The uncached copy needs to align the destination for - * movnti and friends. - */ -.macro ALIGN_DESTINATION - /* check for bad alignment of destination */ - movl %edi,%ecx - andl $7,%ecx - jz 102f /* already aligned */ - subl $8,%ecx - negl %ecx - subl %ecx,%edx -100: movb (%rsi),%al -101: movb %al,(%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz 100b -102: - - _ASM_EXTABLE_CPY(100b, .Lcopy_user_handle_align) - _ASM_EXTABLE_CPY(101b, .Lcopy_user_handle_align) -.endm - - -/* - * copy_user_nocache - Uncached memory copy with exception handling - * This will force destination out of cache for more performance. - * - * Note: Cached memory copy is used when destination or size is not - * naturally aligned. That is: - * - Require 8-byte alignment when size is 8 bytes or larger. - * - Require 4-byte alignment when size is 4 bytes. 
- */ -SYM_FUNC_START(__copy_user_nocache) - /* If size is less than 8 bytes, go to 4-byte copy */ - cmpl $8,%edx - jb .L_4b_nocache_copy_entry - - /* If destination is not 8-byte aligned, "cache" copy to align it */ - ALIGN_DESTINATION - - /* Set 4x8-byte copy count and remainder */ - movl %edx,%ecx - andl $63,%edx - shrl $6,%ecx - jz .L_8b_nocache_copy_entry /* jump if count is 0 */ - - /* Perform 4x8-byte nocache loop-copy */ -.L_4x8b_nocache_copy_loop: -1: movq (%rsi),%r8 -2: movq 1*8(%rsi),%r9 -3: movq 2*8(%rsi),%r10 -4: movq 3*8(%rsi),%r11 -5: movnti %r8,(%rdi) -6: movnti %r9,1*8(%rdi) -7: movnti %r10,2*8(%rdi) -8: movnti %r11,3*8(%rdi) -9: movq 4*8(%rsi),%r8 -10: movq 5*8(%rsi),%r9 -11: movq 6*8(%rsi),%r10 -12: movq 7*8(%rsi),%r11 -13: movnti %r8,4*8(%rdi) -14: movnti %r9,5*8(%rdi) -15: movnti %r10,6*8(%rdi) -16: movnti %r11,7*8(%rdi) - leaq 64(%rsi),%rsi - leaq 64(%rdi),%rdi - decl %ecx - jnz .L_4x8b_nocache_copy_loop - - /* Set 8-byte copy count and remainder */ -.L_8b_nocache_copy_entry: - movl %edx,%ecx - andl $7,%edx - shrl $3,%ecx - jz .L_4b_nocache_copy_entry /* jump if count is 0 */ - - /* Perform 8-byte nocache loop-copy */ -.L_8b_nocache_copy_loop: -20: movq (%rsi),%r8 -21: movnti %r8,(%rdi) - leaq 8(%rsi),%rsi - leaq 8(%rdi),%rdi - decl %ecx - jnz .L_8b_nocache_copy_loop - - /* If no byte left, we're done */ -.L_4b_nocache_copy_entry: - andl %edx,%edx - jz .L_finish_copy - - /* If destination is not 4-byte aligned, go to byte copy: */ - movl %edi,%ecx - andl $3,%ecx - jnz .L_1b_cache_copy_entry - - /* Set 4-byte copy count (1 or 0) and remainder */ - movl %edx,%ecx - andl $3,%edx - shrl $2,%ecx - jz .L_1b_cache_copy_entry /* jump if count is 0 */ - - /* Perform 4-byte nocache copy: */ -30: movl (%rsi),%r8d -31: movnti %r8d,(%rdi) - leaq 4(%rsi),%rsi - leaq 4(%rdi),%rdi - - /* If no bytes left, we're done: */ - andl %edx,%edx - jz .L_finish_copy - - /* Perform byte "cache" loop-copy for the remainder */ -.L_1b_cache_copy_entry: - movl %edx,%ecx -.L_1b_cache_copy_loop: -40: movb (%rsi),%al -41: movb %al,(%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz .L_1b_cache_copy_loop - - /* Finished copying; fence the prior stores */ -.L_finish_copy: - xorl %eax,%eax - sfence - RET - -.L_fixup_4x8b_copy: - shll $6,%ecx - addl %ecx,%edx - jmp .L_fixup_handle_tail -.L_fixup_8b_copy: - lea (%rdx,%rcx,8),%rdx - jmp .L_fixup_handle_tail -.L_fixup_4b_copy: - lea (%rdx,%rcx,4),%rdx - jmp .L_fixup_handle_tail -.L_fixup_1b_copy: - movl %ecx,%edx -.L_fixup_handle_tail: - sfence - jmp .Lcopy_user_handle_tail - - _ASM_EXTABLE_CPY(1b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(2b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(3b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(4b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(5b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(6b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(7b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(8b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(9b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(10b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(11b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(12b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(13b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(14b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(15b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(16b, .L_fixup_4x8b_copy) - _ASM_EXTABLE_CPY(20b, .L_fixup_8b_copy) - _ASM_EXTABLE_CPY(21b, .L_fixup_8b_copy) - _ASM_EXTABLE_CPY(30b, .L_fixup_4b_copy) - _ASM_EXTABLE_CPY(31b, .L_fixup_4b_copy) - _ASM_EXTABLE_CPY(40b, .L_fixup_1b_copy) - _ASM_EXTABLE_CPY(41b, .L_fixup_1b_copy) - -/* - * Try to copy last bytes. 
- * Since protection fault in copy_from/to_user is not a normal situation, - * it is not necessary to optimize tail handling. - * Don't try to copy the tail if machine check happened - * - * Input: - * eax trap number written by ex_handler_copy() - * rdi destination - * rsi source - * rdx count - * - * Output: - * eax uncopied bytes or 0 if successful. - */ -.Lcopy_user_handle_tail: - cmp $X86_TRAP_MC,%eax - je 3f - - movl %edx,%ecx -1: rep movsb -2: mov %ecx,%eax - RET - -3: - movl %edx,%eax - RET - - _ASM_EXTABLE_CPY(1b, 2b) - -.Lcopy_user_handle_align: - addl %ecx,%edx - jmp .Lcopy_user_handle_tail - -SYM_FUNC_END(__copy_user_nocache) -EXPORT_SYMBOL(__copy_user_nocache) diff --git a/arch/x86/lib/copy_user_uncached_64.S b/arch/x86/lib/copy_user_uncached_64.S new file mode 100644 index 000000000000..5c5f38d32672 --- /dev/null +++ b/arch/x86/lib/copy_user_uncached_64.S @@ -0,0 +1,242 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Linus Torvalds + */ + +#include +#include +#include + +/* + * copy_user_nocache - Uncached memory copy with exception handling + * + * This copies from user space into kernel space, but the kernel + * space accesses can take a machine check exception, so they too + * need exception handling. + * + * Note: only 32-bit and 64-bit stores have non-temporal versions, + * and we only use aligned versions. Any unaligned parts at the + * start or end of the copy will be done using normal cached stores. + * + * Input: + * rdi destination + * rsi source + * edx count + * + * Output: + * rax uncopied bytes or 0 if successful. + */ +SYM_FUNC_START(__copy_user_nocache) + /* If destination is not 7-byte aligned, we'll have to align it */ + testb $7,%dil + jne .Lalign + +.Lis_aligned: + cmp $64,%edx + jb .Lquadwords + + .p2align 4,0x90 +.Lunrolled: +10: movq (%rsi),%r8 +11: movq 8(%rsi),%r9 +12: movq 16(%rsi),%r10 +13: movq 24(%rsi),%r11 +20: movnti %r8,(%rdi) +21: movnti %r9,8(%rdi) +22: movnti %r10,16(%rdi) +23: movnti %r11,24(%rdi) +30: movq 32(%rsi),%r8 +31: movq 40(%rsi),%r9 +32: movq 48(%rsi),%r10 +33: movq 56(%rsi),%r11 +40: movnti %r8,32(%rdi) +41: movnti %r9,40(%rdi) +42: movnti %r10,48(%rdi) +43: movnti %r11,56(%rdi) + + addq $64,%rsi + addq $64,%rdi + sub $64,%edx + cmp $64,%edx + jae .Lunrolled + +/* + * First set of user mode loads have been done + * without any stores, so if they fail, we can + * just try the non-unrolled loop. + */ +_ASM_EXTABLE_UA(10b, .Lquadwords) +_ASM_EXTABLE_UA(11b, .Lquadwords) +_ASM_EXTABLE_UA(12b, .Lquadwords) +_ASM_EXTABLE_UA(13b, .Lquadwords) + +/* + * The second set of user mode loads have been + * done with 32 bytes stored to the destination, + * so we need to take that into account before + * falling back to the unrolled loop. + */ +_ASM_EXTABLE_UA(30b, .Lfixup32) +_ASM_EXTABLE_UA(31b, .Lfixup32) +_ASM_EXTABLE_UA(32b, .Lfixup32) +_ASM_EXTABLE_UA(33b, .Lfixup32) + +/* + * An exception on a write means that we're + * done, but we need to update the count + * depending on where in the unrolled loop + * we were. 
+ */ +_ASM_EXTABLE_UA(20b, .Ldone0) +_ASM_EXTABLE_UA(21b, .Ldone8) +_ASM_EXTABLE_UA(22b, .Ldone16) +_ASM_EXTABLE_UA(23b, .Ldone24) +_ASM_EXTABLE_UA(40b, .Ldone32) +_ASM_EXTABLE_UA(41b, .Ldone40) +_ASM_EXTABLE_UA(42b, .Ldone48) +_ASM_EXTABLE_UA(43b, .Ldone56) + +.Lquadwords: + cmp $8,%edx + jb .Llong +50: movq (%rsi),%rax +51: movnti %rax,(%rdi) + addq $8,%rsi + addq $8,%rdi + sub $8,%edx + jmp .Lquadwords + +/* + * If we fail on the last full quadword, we will + * not try to do any byte-wise cached accesses. + * We will try to do one more 4-byte uncached + * one, though. + */ +_ASM_EXTABLE_UA(50b, .Llast4) +_ASM_EXTABLE_UA(51b, .Ldone0) + +.Llong: + test $4,%dl + je .Lword +60: movl (%rsi),%eax +61: movnti %eax,(%rdi) + addq $4,%rsi + addq $4,%rdi + sub $4,%edx +.Lword: + sfence + test $2,%dl + je .Lbyte +70: movw (%rsi),%ax +71: movw %ax,(%rdi) + addq $2,%rsi + addq $2,%rdi + sub $2,%edx +.Lbyte: + test $1,%dl + je .Ldone +80: movb (%rsi),%al +81: movb %al,(%rdi) + dec %edx +.Ldone: + mov %edx,%eax + RET + +/* + * If we fail on the last four bytes, we won't + * bother with any fixups. It's dead, Jim. Note + * that there's no need for 'sfence' for any + * of this, since the exception will have been + * serializing. + */ +_ASM_EXTABLE_UA(60b, .Ldone) +_ASM_EXTABLE_UA(61b, .Ldone) +_ASM_EXTABLE_UA(70b, .Ldone) +_ASM_EXTABLE_UA(71b, .Ldone) +_ASM_EXTABLE_UA(80b, .Ldone) +_ASM_EXTABLE_UA(81b, .Ldone) + +/* + * This is the "head needs aliging" case when + * the destination isn't 8-byte aligned. The + * 4-byte case can be done uncached, but any + * smaller alignment is done with regular stores. + */ +.Lalign: + test $1,%dil + je .Lalign_word + test %edx,%edx + je .Ldone +90: movb (%rsi),%al +91: movb %al,(%rdi) + inc %rsi + inc %rdi + dec %edx +.Lalign_word: + test $2,%dil + je .Lalign_long + cmp $2,%edx + jb .Lbyte +92: movw (%rsi),%ax +93: movw %ax,(%rdi) + addq $2,%rsi + addq $2,%rdi + sub $2,%edx +.Lalign_long: + test $4,%dil + je .Lis_aligned + cmp $4,%edx + jb .Lword +94: movl (%rsi),%eax +95: movnti %eax,(%rdi) + addq $4,%rsi + addq $4,%rdi + sub $4,%edx + jmp .Lis_aligned + +/* + * If we fail on the initial alignment accesses, + * we're all done. Again, no point in trying to + * do byte-by-byte probing if the 4-byte load + * fails - we're not doing any uncached accesses + * any more. + */ +_ASM_EXTABLE_UA(90b, .Ldone) +_ASM_EXTABLE_UA(91b, .Ldone) +_ASM_EXTABLE_UA(92b, .Ldone) +_ASM_EXTABLE_UA(93b, .Ldone) +_ASM_EXTABLE_UA(94b, .Ldone) +_ASM_EXTABLE_UA(95b, .Ldone) + +/* + * Exception table fixups for faults in the middle + */ +.Ldone56: sub $8,%edx +.Ldone48: sub $8,%edx +.Ldone40: sub $8,%edx +.Ldone32: sub $8,%edx +.Ldone24: sub $8,%edx +.Ldone16: sub $8,%edx +.Ldone8: sub $8,%edx +.Ldone0: + mov %edx,%eax + RET + +.Lfixup32: + addq $32,%rsi + addq $32,%rdi + sub $32,%edx + jmp .Lquadwords + +.Llast4: +52: movl (%rsi),%eax +53: movnti %eax,(%rdi) + sfence + sub $4,%edx + mov %edx,%eax + RET +_ASM_EXTABLE_UA(52b, .Ldone0) +_ASM_EXTABLE_UA(53b, .Ldone0) + +SYM_FUNC_END(__copy_user_nocache) +EXPORT_SYMBOL(__copy_user_nocache)
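Taken together, the user-copy patches in this series hinge on one calling-convention idea: write the out-of-line fallbacks so that their register interface is exactly the one the string instructions already use, which lets the ALTERNATIVE() machinery swap a bare 'rep movsb' or 'rep stosb' in and out of the call site without touching the surrounding asm constraints. As a closing illustration, the user-space sketch below (the helper name rep_movsb_copy is invented, and the kernel's stac()/clac() and exception-table handling are omitted) restates that contract for the copy case: destination in %rdi, source in %rsi, count in %rcx, and whatever is left in %rcx afterwards is the uncopied byte count, which is exactly what rep_movs_alternative is now written to return.

#include <stddef.h>

/*
 * Hypothetical user-space sketch of the 'rep movs' register contract:
 * %rdi/%rsi/%rcx in, remaining count back in %rcx.  In the kernel,
 * ALTERNATIVE() keeps the 'rep movsb' on FSRM CPUs and patches it into
 * "call rep_movs_alternative" otherwise; both follow this contract.
 */
static inline size_t rep_movsb_copy(void *dst, const void *src, size_t len)
{
	asm volatile("rep movsb"
		     : "+D" (dst), "+S" (src), "+c" (len)
		     : : "memory");
	return len;	/* bytes not copied; 0 unless the copy was cut short */
}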