I noticed ksm spending quite a lot of time in memcmp on a large KVM box. The current memcmp loop is very unoptimised - byte at a time compares with no loop unrolling. We can do much much better.

Optimise the loop in a few ways:

- Unroll the byte at a time loop

- For large (at least 32 byte) comparisons that are also 8 byte aligned, use an unrolled modulo scheduled loop using 8 byte loads. This is similar to our glibc memcmp.

A simple microbenchmark testing 10000000 iterations of an 8192 byte memcmp was used to measure the performance:

baseline: 29.93 s
modified:  1.70 s

Just over 17x faster.

v2: Incorporated some suggestions from Segher:

- Use andi. instead of rldicl.

- Convert bdnzt eq, to bdnz. It's just duplicating the earlier compare and was a relic from a previous version.

- Don't use cr5, we have plans to use that CR field for fast local atomics.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
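For illustration only, here is a minimal C sketch of the strategy the commit describes: use 8-byte loads when both buffers are 8-byte aligned and the length is at least 32 bytes, and fall back to byte-at-a-time compares for the tail or to resolve a differing doubleword. The function name memcmp_words and the small test driver are hypothetical; the kernel routine itself is hand-written, unrolled, modulo-scheduled assembly in memcmp_64.S and is not shown here.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * Hypothetical illustration only: memcmp_words() is not the kernel
 * routine.  It merely shows the idea of comparing 8 bytes at a time
 * for large, mutually 8-byte-aligned buffers, with a byte loop for
 * everything else.
 */
static int memcmp_words(const void *s1, const void *s2, size_t n)
{
	const unsigned char *p1 = s1, *p2 = s2;

	/* Use 8-byte loads only when both pointers are 8-byte aligned. */
	if (n >= 32 && !(((uintptr_t)p1 | (uintptr_t)p2) & 7)) {
		while (n >= 8) {
			uint64_t a, b;

			memcpy(&a, p1, 8);
			memcpy(&b, p2, 8);
			if (a != b)
				break;	/* resolve the differing word per byte */
			p1 += 8;
			p2 += 8;
			n -= 8;
		}
	}

	/* Byte-at-a-time compare for the tail or a differing word. */
	while (n--) {
		int d = *p1++ - *p2++;

		if (d)
			return d;
	}
	return 0;
}

int main(void)
{
	static unsigned char a[8192], b[8192];

	memset(a, 0x5a, sizeof(a));
	memset(b, 0x5a, sizeof(b));
	b[8000] = 0x5b;

	/* Results should agree in sign with libc memcmp. */
	printf("equal prefix: %d vs %d\n",
	       memcmp_words(a, b, 4096), memcmp(a, b, 4096));
	printf("differs:      %d vs %d\n",
	       memcmp_words(a, b, 8192) < 0, memcmp(a, b, 8192) < 0);
	return 0;
}

The assembly version goes further than this sketch by unrolling and modulo scheduling the 8-byte loop, which is where the measured speedup quoted above comes from.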
Makefile
#
# Makefile for ppc-specific library files..
#

subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror

ccflags-$(CONFIG_PPC64)	:= $(NO_MINIMAL_TOC)

CFLAGS_REMOVE_code-patching.o = -pg
CFLAGS_REMOVE_feature-fixups.o = -pg

obj-y := string.o alloc.o \
	 crtsavres.o ppc_ksyms.o
obj-$(CONFIG_PPC32)	+= div64.o copy_32.o

obj-$(CONFIG_PPC64)	+= copypage_64.o copyuser_64.o \
			   usercopy_64.o mem_64.o hweight_64.o \
			   copyuser_power7.o string_64.o copypage_power7.o \
			   memcmp_64.o
ifeq ($(CONFIG_GENERIC_CSUM),)
obj-y			+= checksum_$(CONFIG_WORD_SIZE).o
obj-$(CONFIG_PPC64)	+= checksum_wrappers_64.o
endif

obj-$(CONFIG_PPC64)		+= memcpy_power7.o memcpy_64.o

obj-$(CONFIG_PPC_EMULATE_SSTEP)	+= sstep.o ldstfp.o

ifeq ($(CONFIG_PPC64),y)
obj-$(CONFIG_SMP)	+= locks.o
obj-$(CONFIG_ALTIVEC)	+= vmx-helper.o
endif

obj-$(CONFIG_PPC_LIB_RHEAP) += rheap.o

obj-y			+= code-patching.o
obj-y			+= feature-fixups.o
obj-$(CONFIG_FTR_FIXUP_SELFTEST) += feature-fixups-test.o

obj-$(CONFIG_ALTIVEC)	+= xor_vmx.o
CFLAGS_xor_vmx.o += -maltivec -mabi=altivec