From 01341fbd0d8d4e717fc1231cdffe00343088ce0b Mon Sep 17 00:00:00 2001 From: Yunfeng Ye Date: Thu, 19 Nov 2020 14:21:25 +0800 Subject: [PATCH 001/547] workqueue: Kick a worker based on the actual activation of delayed works In realtime scenario, We do not want to have interference on the isolated cpu cores. but when invoking alloc_workqueue() for percpu wq on the housekeeping cpu, it kick a kworker on the isolated cpu. alloc_workqueue pwq_adjust_max_active wake_up_worker The comment in pwq_adjust_max_active() said: "Need to kick a worker after thawed or an unbound wq's max_active is bumped" So it is unnecessary to kick a kworker for percpu's wq when invoking alloc_workqueue(). this patch only kick a worker based on the actual activation of delayed works. Signed-off-by: Yunfeng Ye Reviewed-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 437935e7a199..0695c7895c89 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3728,17 +3728,24 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) * is updated and visible. */ if (!freezable || !workqueue_freezing) { + bool kick = false; + pwq->max_active = wq->saved_max_active; while (!list_empty(&pwq->delayed_works) && - pwq->nr_active < pwq->max_active) + pwq->nr_active < pwq->max_active) { pwq_activate_first_delayed(pwq); + kick = true; + } /* * Need to kick a worker after thawed or an unbound wq's - * max_active is bumped. It's a slow path. Do it always. + * max_active is bumped. In realtime scenarios, always kicking a + * worker will cause interference on the isolated cpu cores, so + * let's kick iff work items were activated. */ - wake_up_worker(pwq->pool); + if (kick) + wake_up_worker(pwq->pool); } else { pwq->max_active = 0; } From 58315c96651152b9f438e5e56c910994234e2c7a Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Mon, 9 Nov 2020 16:01:11 +0530 Subject: [PATCH 002/547] kernel: cgroup: Mundane spelling fixes throughout the file Few spelling fixes throughout the file. Signed-off-by: Bhaskar Chowdhury Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index e41c21819ba0..690f477d537c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -244,7 +244,7 @@ bool cgroup_ssid_enabled(int ssid) * * The default hierarchy is the v2 interface of cgroup and this function * can be used to test whether a cgroup is on the default hierarchy for - * cases where a subsystem should behave differnetly depending on the + * cases where a subsystem should behave differently depending on the * interface version. * * List of changed behaviors: @@ -262,7 +262,7 @@ bool cgroup_ssid_enabled(int ssid) * "cgroup.procs" instead. * * - "cgroup.procs" is not sorted. pids will be unique unless they got - * recycled inbetween reads. + * recycled in-between reads. * * - "release_agent" and "notify_on_release" are removed. Replacement * notification mechanism will be implemented. @@ -345,7 +345,7 @@ static bool cgroup_is_mixable(struct cgroup *cgrp) return !cgroup_parent(cgrp); } -/* can @cgrp become a thread root? should always be true for a thread root */ +/* can @cgrp become a thread root? Should always be true for a thread root */ static bool cgroup_can_be_thread_root(struct cgroup *cgrp) { /* mixables don't care */ @@ -530,7 +530,7 @@ static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp, * the root css is returned, so this function always returns a valid css. * * The returned css is not guaranteed to be online, and therefore it is the - * callers responsiblity to tryget a reference for it. + * callers responsibility to try get a reference for it. */ struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, struct cgroup_subsys *ss) @@ -702,7 +702,7 @@ EXPORT_SYMBOL_GPL(of_css); ; \ else -/* walk live descendants in preorder */ +/* walk live descendants in pre order */ #define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \ css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \ if (({ lockdep_assert_held(&cgroup_mutex); \ @@ -936,7 +936,7 @@ void put_css_set_locked(struct css_set *cset) WARN_ON_ONCE(!list_empty(&cset->threaded_csets)); - /* This css_set is dead. unlink it and release cgroup and css refs */ + /* This css_set is dead. Unlink it and release cgroup and css refs */ for_each_subsys(ss, ssid) { list_del(&cset->e_cset_node[ssid]); css_put(cset->subsys[ssid]); @@ -1061,7 +1061,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, /* * Build the set of subsystem state objects that we want to see in the - * new css_set. while subsystems can change globally, the entries here + * new css_set. While subsystems can change globally, the entries here * won't change, so no need for locking. */ for_each_subsys(ss, i) { @@ -1151,7 +1151,7 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, /* * Always add links to the tail of the lists so that the lists are - * in choronological order. + * in chronological order. */ list_move_tail(&link->cset_link, &cgrp->cset_links); list_add_tail(&link->cgrp_link, &cset->cgrp_links); @@ -4137,7 +4137,7 @@ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, * implies that if we observe !CSS_RELEASED on @pos in this RCU * critical section, the one pointed to by its next pointer is * guaranteed to not have finished its RCU grace period even if we - * have dropped rcu_read_lock() inbetween iterations. + * have dropped rcu_read_lock() in-between iterations. * * If @pos has CSS_RELEASED set, its next pointer can't be * dereferenced; however, as each css is given a monotonically @@ -4385,7 +4385,7 @@ static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it) } /** - * css_task_iter_advance_css_set - advance a task itererator to the next css_set + * css_task_iter_advance_css_set - advance a task iterator to the next css_set * @it: the iterator to advance * * Advance @it to the next css_set to walk. @@ -6320,7 +6320,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) * * Find the cgroup at @path on the default hierarchy, increment its * reference count and return it. Returns pointer to the found cgroup on - * success, ERR_PTR(-ENOENT) if @path doens't exist and ERR_PTR(-ENOTDIR) + * success, ERR_PTR(-ENOENT) if @path doesn't exist and ERR_PTR(-ENOTDIR) * if @path points to a non-directory. */ struct cgroup *cgroup_get_from_path(const char *path) From 5a7b5f32c5aa628841502d19a813c633ff6ecbe4 Mon Sep 17 00:00:00 2001 From: Hui Su Date: Fri, 6 Nov 2020 22:47:40 +0800 Subject: [PATCH 003/547] cgroup/cgroup.c: replace 'of->kn->priv' with of_cft() we have supplied the inline function: of_cft() in cgroup.h. So replace the direct use 'of->kn->priv' with inline func of_cft(), which is more readable. Signed-off-by: Hui Su Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 690f477d537c..385f80cf7d0c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3657,7 +3657,7 @@ static ssize_t cgroup_freeze_write(struct kernfs_open_file *of, static int cgroup_file_open(struct kernfs_open_file *of) { - struct cftype *cft = of->kn->priv; + struct cftype *cft = of_cft(of); if (cft->open) return cft->open(of); @@ -3666,7 +3666,7 @@ static int cgroup_file_open(struct kernfs_open_file *of) static void cgroup_file_release(struct kernfs_open_file *of) { - struct cftype *cft = of->kn->priv; + struct cftype *cft = of_cft(of); if (cft->release) cft->release(of); @@ -3677,7 +3677,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, { struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; struct cgroup *cgrp = of->kn->parent->priv; - struct cftype *cft = of->kn->priv; + struct cftype *cft = of_cft(of); struct cgroup_subsys_state *css; int ret; @@ -3727,7 +3727,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt) { - struct cftype *cft = of->kn->priv; + struct cftype *cft = of_cft(of); if (cft->poll) return cft->poll(of, pt); From 665f1388bc9713c81989dda6eed5cde52d57c255 Mon Sep 17 00:00:00 2001 From: Andrey Zhizhikin Date: Mon, 30 Nov 2020 12:42:33 +0000 Subject: [PATCH 004/547] ARM: omap2plus_defconfig: drop unused POWER_AVS option Commit 785b5bb41b0a ("PM: AVS: Drop the avs directory and the corresponding Kconfig") moved AVS code to SOC-specific folders, and removed corresponding Kconfig from drivers/power, leaving original POWER_AVS config option enabled in omap2plus_defconfig file. Remove the option, which has no references in the tree anymore. Fixes: 785b5bb41b0a ("PM: AVS: Drop the avs directory and the corresponding Kconfig") Signed-off-by: Andrey Zhizhikin Reviewed-by: Krzysztof Kozlowski Reviewed-by: Nishanth Menon Cc: Nishanth Menon Cc: Ulf Hansson Signed-off-by: Tony Lindgren --- arch/arm/configs/omap2plus_defconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig index 77716f500813..23b53526c4eb 100644 --- a/arch/arm/configs/omap2plus_defconfig +++ b/arch/arm/configs/omap2plus_defconfig @@ -297,7 +297,6 @@ CONFIG_GPIO_TWL4030=y CONFIG_W1=m CONFIG_HDQ_MASTER_OMAP=m CONFIG_W1_SLAVE_DS250X=m -CONFIG_POWER_AVS=y CONFIG_POWER_RESET=y CONFIG_POWER_RESET_GPIO=y CONFIG_BATTERY_BQ27XXX=m From f1dc15cd7fc146107cad2a926d9c1d005f69002a Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Sun, 29 Nov 2020 16:47:10 +0200 Subject: [PATCH 005/547] ARM: dts: OMAP3: disable AES on N950/N9 AES needs to be disabled on Nokia N950/N9 as well (HS devices), otherwise kernel fails to boot. Fixes: c312f066314e ("ARM: dts: omap3: Migrate AES from hwmods to sysc-omap2") Signed-off-by: Aaro Koskinen Signed-off-by: Tony Lindgren --- arch/arm/boot/dts/omap3-n950-n9.dtsi | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm/boot/dts/omap3-n950-n9.dtsi b/arch/arm/boot/dts/omap3-n950-n9.dtsi index 11d41e86f814..7dde9fbb06d3 100644 --- a/arch/arm/boot/dts/omap3-n950-n9.dtsi +++ b/arch/arm/boot/dts/omap3-n950-n9.dtsi @@ -494,3 +494,11 @@ clock-names = "sysclk"; }; }; + +&aes1_target { + status = "disabled"; +}; + +&aes2_target { + status = "disabled"; +}; From 9836720911cfec25d3fbdead1c438bf87e0f2841 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 22 Nov 2020 04:36:52 +0900 Subject: [PATCH 006/547] ARC: build: remove non-existing bootpImage from KBUILD_IMAGE The deb-pkg builds for ARCH=arc fail. $ export CROSS_COMPILE= $ make -s ARCH=arc defconfig $ make ARCH=arc bindeb-pkg SORTTAB vmlinux SYSMAP System.map MODPOST Module.symvers make KERNELRELEASE=5.10.0-rc4 ARCH=arc KBUILD_BUILD_VERSION=2 -f ./Makefile intdeb-pkg sh ./scripts/package/builddeb cp: cannot stat 'arch/arc/boot/bootpImage': No such file or directory make[4]: *** [scripts/Makefile.package:87: intdeb-pkg] Error 1 make[3]: *** [Makefile:1527: intdeb-pkg] Error 2 make[2]: *** [debian/rules:13: binary-arch] Error 2 dpkg-buildpackage: error: debian/rules binary subprocess returned exit status 2 make[1]: *** [scripts/Makefile.package:83: bindeb-pkg] Error 2 make: *** [Makefile:1527: bindeb-pkg] Error 2 The reason is obvious; arch/arc/Makefile sets $(boot)/bootpImage as the default image, but there is no rule to build it. Remove the meaningless KBUILD_IMAGE assignment so it will fallback to the default vmlinux. With this change, you can build the deb package. I removed the 'bootpImage' target as well. At best, it provides 'make bootpImage' as an alias of 'make vmlinux', but I do not see much sense in doing so. Signed-off-by: Masahiro Yamada Signed-off-by: Vineet Gupta --- arch/arc/Makefile | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/arc/Makefile b/arch/arc/Makefile index 0c6bf0d1df7a..acf99420e161 100644 --- a/arch/arc/Makefile +++ b/arch/arc/Makefile @@ -102,12 +102,6 @@ libs-y += arch/arc/lib/ $(LIBGCC) boot := arch/arc/boot -#default target for make without any arguments. -KBUILD_IMAGE := $(boot)/bootpImage - -all: bootpImage -bootpImage: vmlinux - boot_targets += uImage uImage.bin uImage.gz $(boot_targets): vmlinux From f2712ec76a5433e5ec9def2bd52a95df1f96d050 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 22 Nov 2020 04:36:53 +0900 Subject: [PATCH 007/547] ARC: build: add uImage.lzma to the top-level target arch/arc/boot/Makefile supports uImage.lzma, but you cannot do 'make uImage.lzma' because the corresponding target is missing in arch/arc/Makefile. Add it. I also changed the assignment operator '+=' to ':=' since this is the only place where we expect this variable to be set. Signed-off-by: Masahiro Yamada Signed-off-by: Vineet Gupta --- arch/arc/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arc/Makefile b/arch/arc/Makefile index acf99420e161..61a41123ad4c 100644 --- a/arch/arc/Makefile +++ b/arch/arc/Makefile @@ -102,7 +102,7 @@ libs-y += arch/arc/lib/ $(LIBGCC) boot := arch/arc/boot -boot_targets += uImage uImage.bin uImage.gz +boot_targets := uImage uImage.bin uImage.gz uImage.lzma $(boot_targets): vmlinux $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@ From 0cfccb3c04934cdef42ae26042139f16e805b5f7 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 22 Nov 2020 04:36:54 +0900 Subject: [PATCH 008/547] ARC: build: add boot_targets to PHONY The top-level boot_targets (uImage and uImage.*) should be phony targets. They just let Kbuild descend into arch/arc/boot/ and create files there. If a file exists in the top directory with the same name, the boot image will not be created. You can confirm it by the following steps: $ export CROSS_COMPILE= $ make -s ARCH=arc defconfig all # vmlinux will be built $ touch uImage.gz $ make ARCH=arc uImage.gz CALL scripts/atomic/check-atomics.sh CALL scripts/checksyscalls.sh CHK include/generated/compile.h # arch/arc/boot/uImage.gz is not created Specify the targets as PHONY to fix this. Signed-off-by: Masahiro Yamada Signed-off-by: Vineet Gupta --- arch/arc/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arc/Makefile b/arch/arc/Makefile index 61a41123ad4c..cf9da9aea12a 100644 --- a/arch/arc/Makefile +++ b/arch/arc/Makefile @@ -104,6 +104,7 @@ boot := arch/arc/boot boot_targets := uImage uImage.bin uImage.gz uImage.lzma +PHONY += $(boot_targets) $(boot_targets): vmlinux $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@ From c5e6ae563c802c4d828d42e134af64004db2e58c Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 22 Nov 2020 04:36:55 +0900 Subject: [PATCH 009/547] ARC: build: move symlink creation to arch/arc/Makefile to avoid race If you run 'make uImage uImage.gz' with the parallel option, uImage.gz will be created by two threads simultaneously. This is because arch/arc/Makefile does not specify the dependency between uImage and uImage.gz. Hence, GNU Make assumes they can be built in parallel. One thread descends into arch/arc/boot/ to create uImage, and another to create uImage.gz. Please notice the same log is displayed twice in the following steps: $ export CROSS_COMPILE= $ make -s ARCH=arc defconfig $ make -j$(nproc) ARCH=arc uImage uImage.gz [ snip ] LD vmlinux SORTTAB vmlinux SYSMAP System.map OBJCOPY arch/arc/boot/vmlinux.bin OBJCOPY arch/arc/boot/vmlinux.bin GZIP arch/arc/boot/vmlinux.bin.gz GZIP arch/arc/boot/vmlinux.bin.gz UIMAGE arch/arc/boot/uImage.gz UIMAGE arch/arc/boot/uImage.gz Image Name: Linux-5.10.0-rc4-00003-g62f23044 Created: Sun Nov 22 02:52:26 2020 Image Type: ARC Linux Kernel Image (gzip compressed) Data Size: 2109376 Bytes = 2059.94 KiB = 2.01 MiB Load Address: 80000000 Entry Point: 80004000 Image arch/arc/boot/uImage is ready Image Name: Linux-5.10.0-rc4-00003-g62f23044 Created: Sun Nov 22 02:52:26 2020 Image Type: ARC Linux Kernel Image (gzip compressed) Data Size: 2815455 Bytes = 2749.47 KiB = 2.69 MiB Load Address: 80000000 Entry Point: 80004000 This is a race between the two threads trying to write to the same file arch/arc/boot/uImage.gz. This is a potential problem that can generate a broken file. I fixed a similar problem for ARM by commit 3939f3345050 ("ARM: 8418/1: add boot image dependencies to not generate invalid images"). I highly recommend to avoid such build rules that cause a race condition. Move the uImage rule to arch/arc/Makefile. Another strangeness is that arch/arc/boot/Makefile compares the timestamps between $(obj)/uImage and $(obj)/uImage.*: $(obj)/uImage: $(obj)/uImage.$(suffix-y) @ln -sf $(notdir $<) $@ @echo ' Image $@ is ready' This does not work as expected since $(obj)/uImage is a symlink. The symlink should be created in a phony target rule. I used $(kecho) instead of echo to suppress the message 'Image arch/arc/boot/uImage is ready' when the -s option is given. Signed-off-by: Masahiro Yamada Signed-off-by: Vineet Gupta --- arch/arc/Makefile | 13 ++++++++++++- arch/arc/boot/Makefile | 11 +---------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/arc/Makefile b/arch/arc/Makefile index cf9da9aea12a..578bdbbb0fa7 100644 --- a/arch/arc/Makefile +++ b/arch/arc/Makefile @@ -102,11 +102,22 @@ libs-y += arch/arc/lib/ $(LIBGCC) boot := arch/arc/boot -boot_targets := uImage uImage.bin uImage.gz uImage.lzma +boot_targets := uImage.bin uImage.gz uImage.lzma PHONY += $(boot_targets) $(boot_targets): vmlinux $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@ +uimage-default-y := uImage.bin +uimage-default-$(CONFIG_KERNEL_GZIP) := uImage.gz +uimage-default-$(CONFIG_KERNEL_LZMA) := uImage.lzma + +PHONY += uImage +uImage: $(uimage-default-y) + @ln -sf $< $(boot)/uImage + @$(kecho) ' Image $(boot)/uImage is ready' + +CLEAN_FILES += $(boot)/uImage + archclean: $(Q)$(MAKE) $(clean)=$(boot) diff --git a/arch/arc/boot/Makefile b/arch/arc/boot/Makefile index 538b92f4dd25..3b1f8a69a89e 100644 --- a/arch/arc/boot/Makefile +++ b/arch/arc/boot/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -targets := vmlinux.bin vmlinux.bin.gz uImage +targets := vmlinux.bin vmlinux.bin.gz # uImage build relies on mkimage being availble on your host for ARC target # You will need to build u-boot for ARC, rename mkimage to arc-elf32-mkimage @@ -13,11 +13,6 @@ LINUX_START_TEXT = $$(readelf -h vmlinux | \ UIMAGE_LOADADDR = $(CONFIG_LINUX_LINK_BASE) UIMAGE_ENTRYADDR = $(LINUX_START_TEXT) -suffix-y := bin -suffix-$(CONFIG_KERNEL_GZIP) := gz -suffix-$(CONFIG_KERNEL_LZMA) := lzma - -targets += uImage targets += uImage.bin targets += uImage.gz targets += uImage.lzma @@ -42,7 +37,3 @@ $(obj)/uImage.gz: $(obj)/vmlinux.bin.gz FORCE $(obj)/uImage.lzma: $(obj)/vmlinux.bin.lzma FORCE $(call if_changed,uimage,lzma) - -$(obj)/uImage: $(obj)/uImage.$(suffix-y) - @ln -sf $(notdir $<) $@ - @echo ' Image $@ is ready' From a4e070cfeb9d4961a169a2f1a614665cf51de963 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 22 Nov 2020 04:36:56 +0900 Subject: [PATCH 010/547] ARC: build: remove unneeded extra-y Adding vmlinux.* to extra-y has no point because we expect they are built on demand while building uImage.* Add them to 'targets' is enough to include the corresponding .cmd file. Signed-off-by: Masahiro Yamada Signed-off-by: Vineet Gupta --- arch/arc/boot/Makefile | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/arc/boot/Makefile b/arch/arc/boot/Makefile index 3b1f8a69a89e..b3870cc100bf 100644 --- a/arch/arc/boot/Makefile +++ b/arch/arc/boot/Makefile @@ -1,5 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -targets := vmlinux.bin vmlinux.bin.gz # uImage build relies on mkimage being availble on your host for ARC target # You will need to build u-boot for ARC, rename mkimage to arc-elf32-mkimage @@ -13,12 +12,12 @@ LINUX_START_TEXT = $$(readelf -h vmlinux | \ UIMAGE_LOADADDR = $(CONFIG_LINUX_LINK_BASE) UIMAGE_ENTRYADDR = $(LINUX_START_TEXT) +targets += vmlinux.bin +targets += vmlinux.bin.gz +targets += vmlinux.bin.lzma targets += uImage.bin targets += uImage.gz targets += uImage.lzma -extra-y += vmlinux.bin -extra-y += vmlinux.bin.gz -extra-y += vmlinux.bin.lzma $(obj)/vmlinux.bin: vmlinux FORCE $(call if_changed,objcopy) From 3a71e423133a4b1166ffafcb4a7cfa87ddecb910 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 22 Nov 2020 04:36:57 +0900 Subject: [PATCH 011/547] ARC: build: use $(READELF) instead of hard-coded readelf The top Makefile defines READELF as the readelf in the cross-toolchains. Use it rather than the host readelf. Signed-off-by: Masahiro Yamada Signed-off-by: Vineet Gupta --- arch/arc/boot/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arc/boot/Makefile b/arch/arc/boot/Makefile index b3870cc100bf..5648748c285f 100644 --- a/arch/arc/boot/Makefile +++ b/arch/arc/boot/Makefile @@ -6,7 +6,7 @@ OBJCOPYFLAGS= -O binary -R .note -R .note.gnu.build-id -R .comment -S -LINUX_START_TEXT = $$(readelf -h vmlinux | \ +LINUX_START_TEXT = $$($(READELF) -h vmlinux | \ grep "Entry point address" | grep -o 0x.*) UIMAGE_LOADADDR = $(CONFIG_LINUX_LINK_BASE) From 1967939462641d8b36bcb3fcf06d48e66cd67a4f Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 29 Nov 2020 04:33:35 +0900 Subject: [PATCH 012/547] Compiler Attributes: remove CONFIG_ENABLE_MUST_CHECK Revert commit cebc04ba9aeb ("add CONFIG_ENABLE_MUST_CHECK"). A lot of warn_unused_result warnings existed in 2006, but until now they have been fixed thanks to people doing allmodconfig tests. Our goal is to always enable __must_check where appropriate, so this CONFIG option is no longer needed. I see a lot of defconfig (arch/*/configs/*_defconfig) files having: # CONFIG_ENABLE_MUST_CHECK is not set I did not touch them for now since it would be a big churn. If arch maintainers want to clean them up, please go ahead. While I was here, I also moved __must_check to compiler_attributes.h from compiler_types.h Signed-off-by: Masahiro Yamada Acked-by: Jason A. Donenfeld Acked-by: Nathan Chancellor Reviewed-by: Nick Desaulniers [Moved addition in compiler_attributes.h to keep it sorted] Signed-off-by: Miguel Ojeda --- include/linux/compiler_attributes.h | 6 ++++++ include/linux/compiler_types.h | 6 ------ lib/Kconfig.debug | 8 -------- tools/testing/selftests/wireguard/qemu/debug.config | 1 - 4 files changed, 6 insertions(+), 15 deletions(-) diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index b2a3f4f641a7..ea5e04e75845 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -272,6 +272,12 @@ */ #define __used __attribute__((__used__)) +/* + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-warn_005funused_005fresult-function-attribute + * clang: https://clang.llvm.org/docs/AttributeReference.html#nodiscard-warn-unused-result + */ +#define __must_check __attribute__((__warn_unused_result__)) + /* * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-weak-function-attribute * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#index-weak-variable-attribute diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index ac3fa37a84f9..7ef20d1a6c28 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -110,12 +110,6 @@ struct ftrace_likely_data { unsigned long constant; }; -#ifdef CONFIG_ENABLE_MUST_CHECK -#define __must_check __attribute__((__warn_unused_result__)) -#else -#define __must_check -#endif - #if defined(CC_USING_HOTPATCH) #define notrace __attribute__((hotpatch(0, 0))) #elif defined(CC_USING_PATCHABLE_FUNCTION_ENTRY) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index c789b39ed527..cb8ef4fd0d02 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -286,14 +286,6 @@ config GDB_SCRIPTS endif # DEBUG_INFO -config ENABLE_MUST_CHECK - bool "Enable __must_check logic" - default y - help - Enable the __must_check logic in the kernel build. Disable this to - suppress the "warning: ignoring return value of 'foo', declared with - attribute warn_unused_result" messages. - config FRAME_WARN int "Warn for stack frames larger than" range 0 8192 diff --git a/tools/testing/selftests/wireguard/qemu/debug.config b/tools/testing/selftests/wireguard/qemu/debug.config index b50c2085c1ac..fe07d97df9fa 100644 --- a/tools/testing/selftests/wireguard/qemu/debug.config +++ b/tools/testing/selftests/wireguard/qemu/debug.config @@ -1,5 +1,4 @@ CONFIG_LOCALVERSION="-debug" -CONFIG_ENABLE_MUST_CHECK=y CONFIG_FRAME_POINTER=y CONFIG_STACK_VALIDATION=y CONFIG_DEBUG_KERNEL=y From ec76c2eea903947202098090bbe07a739b5246e9 Mon Sep 17 00:00:00 2001 From: Andreas Kemnade Date: Fri, 4 Dec 2020 10:55:39 +0100 Subject: [PATCH 013/547] ARM: OMAP2+: omap_device: fix idling of devices during probe On the GTA04A5 od->_driver_status was not set to BUS_NOTIFY_BIND_DRIVER during probe of the second mmc used for wifi. Therefore omap_device_late_idle idled the device during probing causing oopses when accessing the registers. It was not set because od->_state was set to OMAP_DEVICE_STATE_IDLE in the notifier callback. Therefore set od->_driver_status also in that case. This came apparent after commit 21b2cec61c04 ("mmc: Set PROBE_PREFER_ASYNCHRONOUS for drivers that existed in v4.4") causing this oops: omap_hsmmc 480b4000.mmc: omap_device_late_idle: enabled but no driver. Idling 8<--- cut here --- Unhandled fault: external abort on non-linefetch (0x1028) at 0xfa0b402c ... (omap_hsmmc_set_bus_width) from [] (omap_hsmmc_set_ios+0x11c/0x258) (omap_hsmmc_set_ios) from [] (mmc_power_up.part.8+0x3c/0xd0) (mmc_power_up.part.8) from [] (mmc_start_host+0x88/0x9c) (mmc_start_host) from [] (mmc_add_host+0x58/0x84) (mmc_add_host) from [] (omap_hsmmc_probe+0x5fc/0x8c0) (omap_hsmmc_probe) from [] (platform_drv_probe+0x48/0x98) (platform_drv_probe) from [] (really_probe+0x1dc/0x3b4) Fixes: 04abaf07f6d5 ("ARM: OMAP2+: omap_device: Sync omap_device and pm_runtime after probe defer") Fixes: 21b2cec61c04 ("mmc: Set PROBE_PREFER_ASYNCHRONOUS for drivers that existed in v4.4") Acked-by: Ulf Hansson Signed-off-by: Andreas Kemnade [tony@atomide.com: left out extra parens, trimmed description stack trace] Signed-off-by: Tony Lindgren --- arch/arm/mach-omap2/omap_device.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/arm/mach-omap2/omap_device.c b/arch/arm/mach-omap2/omap_device.c index fc7bb2ca1672..64b23b0cd23c 100644 --- a/arch/arm/mach-omap2/omap_device.c +++ b/arch/arm/mach-omap2/omap_device.c @@ -230,10 +230,12 @@ static int _omap_device_notifier_call(struct notifier_block *nb, break; case BUS_NOTIFY_BIND_DRIVER: od = to_omap_device(pdev); - if (od && (od->_state == OMAP_DEVICE_STATE_ENABLED) && - pm_runtime_status_suspended(dev)) { + if (od) { od->_driver_status = BUS_NOTIFY_BIND_DRIVER; - pm_runtime_set_active(dev); + if (od->_state == OMAP_DEVICE_STATE_ENABLED && + pm_runtime_status_suspended(dev)) { + pm_runtime_set_active(dev); + } } break; case BUS_NOTIFY_ADD_DEVICE: From 2f6fc9e08bf79f11516edef855283c6212bbe78f Mon Sep 17 00:00:00 2001 From: Andreas Kemnade Date: Tue, 1 Dec 2020 20:12:37 +0100 Subject: [PATCH 014/547] ARM: omap2plus_defconfig: enable SPI GPIO GTA04 uses that for controlling the td028ttec1 panel. So for easier testing/bisecting it is useful to have it enabled in the defconfig. Signed-off-by: Andreas Kemnade Signed-off-by: Tony Lindgren --- arch/arm/configs/omap2plus_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig index 23b53526c4eb..28add0b64d42 100644 --- a/arch/arm/configs/omap2plus_defconfig +++ b/arch/arm/configs/omap2plus_defconfig @@ -280,6 +280,7 @@ CONFIG_SERIAL_OMAP_CONSOLE=y CONFIG_SERIAL_DEV_BUS=y CONFIG_I2C_CHARDEV=y CONFIG_SPI=y +CONFIG_SPI_GPIO=m CONFIG_SPI_OMAP24XX=y CONFIG_SPI_TI_QSPI=m CONFIG_HSI=m From c0bc969c176b10598b31d5d1a5edf9a5261f0a9f Mon Sep 17 00:00:00 2001 From: Carl Philipp Klemm Date: Mon, 7 Dec 2020 20:58:01 +0100 Subject: [PATCH 015/547] ARM: omap2: pmic-cpcap: fix maximum voltage to be consistent with defaults on xt875 xt875 comes up with a iva voltage of 1375000 and android runs at this too. fix maximum voltage to be consistent with this. Signed-off-by: Carl Philipp Klemm Signed-off-by: Tony Lindgren --- arch/arm/mach-omap2/pmic-cpcap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mach-omap2/pmic-cpcap.c b/arch/arm/mach-omap2/pmic-cpcap.c index eab281a5fc9f..09076ad0576d 100644 --- a/arch/arm/mach-omap2/pmic-cpcap.c +++ b/arch/arm/mach-omap2/pmic-cpcap.c @@ -71,7 +71,7 @@ static struct omap_voltdm_pmic omap_cpcap_iva = { .vp_vstepmin = OMAP4_VP_VSTEPMIN_VSTEPMIN, .vp_vstepmax = OMAP4_VP_VSTEPMAX_VSTEPMAX, .vddmin = 900000, - .vddmax = 1350000, + .vddmax = 1375000, .vp_timeout_us = OMAP4_VP_VLIMITTO_TIMEOUT_US, .i2c_slave_addr = 0x44, .volt_reg_addr = 0x0, From 44fd9fb599d3d2be4c6838f4b11eaa459bb33989 Mon Sep 17 00:00:00 2001 From: Bean Huo Date: Mon, 7 Dec 2020 20:01:36 +0100 Subject: [PATCH 016/547] scsi: ufs: Remove unused macro definition POWER_DESC_MAX_SIZE POWER_DESC_MAX_SIZE is unused, remove it. Link: https://lore.kernel.org/r/20201207190137.6858-2-huobean@gmail.com Acked-by: Avri Altman Acked-by: Alim Akhtar Signed-off-by: Bean Huo Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufs.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/scsi/ufs/ufs.h b/drivers/scsi/ufs/ufs.h index d593edb48767..142c2ebd3d7d 100644 --- a/drivers/scsi/ufs/ufs.h +++ b/drivers/scsi/ufs/ufs.h @@ -330,7 +330,6 @@ enum { UFS_DEV_WRITE_BOOSTER_SUP = BIT(8), }; -#define POWER_DESC_MAX_SIZE 0x62 #define POWER_DESC_MAX_ACTV_ICC_LVLS 16 /* Attribute bActiveICCLevel parameter bit masks definitions */ From 1fa0570002e3f66db9b58c32c60de4183b857a19 Mon Sep 17 00:00:00 2001 From: Bean Huo Date: Mon, 7 Dec 2020 20:01:37 +0100 Subject: [PATCH 017/547] scsi: ufs: Fix wrong print message in dev_err() Change dev_err() print message from "dme-reset" to "dme_enable" in function ufshcd_dme_enable(). Link: https://lore.kernel.org/r/20201207190137.6858-3-huobean@gmail.com Acked-by: Alim Akhtar Acked-by: Avri Altman Signed-off-by: Bean Huo Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index c1c401b2b69d..6bcedebae4a8 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -3641,7 +3641,7 @@ static int ufshcd_dme_enable(struct ufs_hba *hba) ret = ufshcd_send_uic_cmd(hba, &uic_cmd); if (ret) dev_err(hba->dev, - "dme-reset: error code %d\n", ret); + "dme-enable: error code %d\n", ret); return ret; } From 1918651f2d7e8d58c9b7c49755c61e41ed655009 Mon Sep 17 00:00:00 2001 From: Randall Huang Date: Mon, 30 Nov 2020 20:14:02 -0800 Subject: [PATCH 018/547] scsi: ufs: Clear UAC for RPMB after ufshcd resets If RPMB is not provisioned, we may see RPMB failure after UFS suspend/resume. Inject request_sense to clear uac in ufshcd reset flow. Link: https://lore.kernel.org/r/20201201041402.3860525-1-jaegeuk@kernel.org Reported-by: kernel test robot Reviewed-by: Stanley Chu Signed-off-by: Randall Huang Signed-off-by: Leo Liou Signed-off-by: Jaegeuk Kim Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 6bcedebae4a8..a84a8759437a 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -225,6 +225,7 @@ static int ufshcd_reset_and_restore(struct ufs_hba *hba); static int ufshcd_eh_host_reset_handler(struct scsi_cmnd *cmd); static int ufshcd_clear_tm_cmd(struct ufs_hba *hba, int tag); static void ufshcd_hba_exit(struct ufs_hba *hba); +static int ufshcd_clear_ua_wluns(struct ufs_hba *hba); static int ufshcd_probe_hba(struct ufs_hba *hba, bool async); static int ufshcd_setup_clocks(struct ufs_hba *hba, bool on); static int ufshcd_uic_hibern8_enter(struct ufs_hba *hba); @@ -6895,7 +6896,8 @@ static int ufshcd_host_reset_and_restore(struct ufs_hba *hba) /* Establish the link again and restore the device */ err = ufshcd_probe_hba(hba, false); - + if (!err) + ufshcd_clear_ua_wluns(hba); out: if (err) dev_err(hba->dev, "%s: Host init failed %d\n", __func__, err); @@ -8379,13 +8381,7 @@ static int ufshcd_set_dev_pwr_mode(struct ufs_hba *hba, * handling context. */ hba->host->eh_noresume = 1; - if (hba->wlun_dev_clr_ua) { - ret = ufshcd_send_request_sense(hba, sdp); - if (ret) - goto out; - /* Unit attention condition is cleared now */ - hba->wlun_dev_clr_ua = false; - } + ufshcd_clear_ua_wluns(hba); cmd[4] = pwr_mode << 4; @@ -8406,7 +8402,7 @@ static int ufshcd_set_dev_pwr_mode(struct ufs_hba *hba, if (!ret) hba->curr_dev_pwr_mode = pwr_mode; -out: + scsi_device_put(sdp); hba->host->eh_noresume = 0; return ret; From f8162ac70ecf5a3ed638f96dc10e0e19b523ec7f Mon Sep 17 00:00:00 2001 From: Stanley Chu Date: Mon, 7 Dec 2020 13:49:54 +0800 Subject: [PATCH 019/547] scsi: ufs: Allow regulators being always-on Introduce a flag "always_on" in struct ufs_vreg to allow vendors to keep the regulator always-on. Link: https://lore.kernel.org/r/20201207054955.24366-2-stanley.chu@mediatek.com Reviewed-by: Andy Teng Signed-off-by: Stanley Chu Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufs.h | 1 + drivers/scsi/ufs/ufshcd.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/ufs/ufs.h b/drivers/scsi/ufs/ufs.h index 142c2ebd3d7d..14dfda735adf 100644 --- a/drivers/scsi/ufs/ufs.h +++ b/drivers/scsi/ufs/ufs.h @@ -512,6 +512,7 @@ struct ufs_query_res { struct ufs_vreg { struct regulator *reg; const char *name; + bool always_on; bool enabled; int min_uV; int max_uV; diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index a84a8759437a..4f3ac2566322 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -8012,7 +8012,7 @@ static int ufshcd_disable_vreg(struct device *dev, struct ufs_vreg *vreg) { int ret = 0; - if (!vreg || !vreg->enabled) + if (!vreg || !vreg->enabled || vreg->always_on) goto out; ret = regulator_disable(vreg->reg); From b3f3d31a528f78d9903253a23a5e5c6bf5280f40 Mon Sep 17 00:00:00 2001 From: Stanley Chu Date: Mon, 7 Dec 2020 13:49:55 +0800 Subject: [PATCH 020/547] scsi: ufs-mediatek: Keep VCC always-on for specific devices For some devices which need extra delay after VCC power down, VCC shall be kept always-on in some MediaTek UFS platforms to ensure the stability of such devices because the extra delay may not be enough in those platforms. Link: https://lore.kernel.org/r/20201207054955.24366-3-stanley.chu@mediatek.com Reviewed-by: Andy Teng Signed-off-by: Stanley Chu Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufs-mediatek.c | 21 +++++++++++++++++++++ drivers/scsi/ufs/ufs-mediatek.h | 1 + 2 files changed, 22 insertions(+) diff --git a/drivers/scsi/ufs/ufs-mediatek.c b/drivers/scsi/ufs/ufs-mediatek.c index 3522458db3bb..80618af7c872 100644 --- a/drivers/scsi/ufs/ufs-mediatek.c +++ b/drivers/scsi/ufs/ufs-mediatek.c @@ -70,6 +70,13 @@ static bool ufs_mtk_is_va09_supported(struct ufs_hba *hba) return !!(host->caps & UFS_MTK_CAP_VA09_PWR_CTRL); } +static bool ufs_mtk_is_broken_vcc(struct ufs_hba *hba) +{ + struct ufs_mtk_host *host = ufshcd_get_variant(hba); + + return !!(host->caps & UFS_MTK_CAP_BROKEN_VCC); +} + static void ufs_mtk_cfg_unipro_cg(struct ufs_hba *hba, bool enable) { u32 tmp; @@ -514,6 +521,9 @@ static void ufs_mtk_init_host_caps(struct ufs_hba *hba) if (of_property_read_bool(np, "mediatek,ufs-disable-ah8")) host->caps |= UFS_MTK_CAP_DISABLE_AH8; + if (of_property_read_bool(np, "mediatek,ufs-broken-vcc")) + host->caps |= UFS_MTK_CAP_BROKEN_VCC; + dev_info(hba->dev, "caps: 0x%x", host->caps); } @@ -1003,6 +1013,17 @@ static int ufs_mtk_apply_dev_quirks(struct ufs_hba *hba) static void ufs_mtk_fixup_dev_quirks(struct ufs_hba *hba) { ufshcd_fixup_dev_quirks(hba, ufs_mtk_dev_fixups); + + if (ufs_mtk_is_broken_vcc(hba) && hba->vreg_info.vcc && + (hba->dev_quirks & UFS_DEVICE_QUIRK_DELAY_AFTER_LPM)) { + hba->vreg_info.vcc->always_on = true; + /* + * VCC will be kept always-on thus we don't + * need any delay during regulator operations + */ + hba->dev_quirks &= ~(UFS_DEVICE_QUIRK_DELAY_BEFORE_LPM | + UFS_DEVICE_QUIRK_DELAY_AFTER_LPM); + } } static void ufs_mtk_event_notify(struct ufs_hba *hba, diff --git a/drivers/scsi/ufs/ufs-mediatek.h b/drivers/scsi/ufs/ufs-mediatek.h index 93d35097dfb0..3f0d3bb769e8 100644 --- a/drivers/scsi/ufs/ufs-mediatek.h +++ b/drivers/scsi/ufs/ufs-mediatek.h @@ -81,6 +81,7 @@ enum ufs_mtk_host_caps { UFS_MTK_CAP_BOOST_CRYPT_ENGINE = 1 << 0, UFS_MTK_CAP_VA09_PWR_CTRL = 1 << 1, UFS_MTK_CAP_DISABLE_AH8 = 1 << 2, + UFS_MTK_CAP_BROKEN_VCC = 1 << 3, }; struct ufs_mtk_crypt_cfg { From c763729a10e538d997744317cf4a1c4f25266066 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 7 Dec 2020 10:31:17 +0200 Subject: [PATCH 021/547] scsi: ufs-pci: Fix restore from S4 for Intel controllers Currently, ufshcd-pci is the only UFS driver with support for suspend-to-disk PM callbacks (i.e. freeze/thaw/restore/poweroff). These callbacks are set by the macro SET_SYSTEM_SLEEP_PM_OPS to the same functions as system suspend/resume. That will work with spm_lvl 5 because spm_lvl 5 will result in a full restore for the ->restore() callback. In the absence of a full restore, the host controller registers will have values set up by the restore kernel (the kernel that boots and loads the restore image) which are not necessarily the same. However it turns out, the only registers that sometimes need restore are the base address registers. This has gone un-noticed because, depending on IOMMU settings, the kernel can end up allocating the same addresses every time. For Intel controllers, an spm_lvl other than 5 can be used, so to support S4 (suspend-to-disk) with spm_lvl other than 5, restore the base address registers. Link: https://lore.kernel.org/r/20201207083120.26732-2-adrian.hunter@intel.com Signed-off-by: Adrian Hunter Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd-pci.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/drivers/scsi/ufs/ufshcd-pci.c b/drivers/scsi/ufs/ufshcd-pci.c index df3a564c3e33..360c25f1f061 100644 --- a/drivers/scsi/ufs/ufshcd-pci.c +++ b/drivers/scsi/ufs/ufshcd-pci.c @@ -163,6 +163,24 @@ static void ufs_intel_common_exit(struct ufs_hba *hba) intel_ltr_hide(hba->dev); } +static int ufs_intel_resume(struct ufs_hba *hba, enum ufs_pm_op op) +{ + /* + * To support S4 (suspend-to-disk) with spm_lvl other than 5, the base + * address registers must be restored because the restore kernel can + * have used different addresses. + */ + ufshcd_writel(hba, lower_32_bits(hba->utrdl_dma_addr), + REG_UTP_TRANSFER_REQ_LIST_BASE_L); + ufshcd_writel(hba, upper_32_bits(hba->utrdl_dma_addr), + REG_UTP_TRANSFER_REQ_LIST_BASE_H); + ufshcd_writel(hba, lower_32_bits(hba->utmrdl_dma_addr), + REG_UTP_TASK_REQ_LIST_BASE_L); + ufshcd_writel(hba, upper_32_bits(hba->utmrdl_dma_addr), + REG_UTP_TASK_REQ_LIST_BASE_H); + return 0; +} + static int ufs_intel_ehl_init(struct ufs_hba *hba) { hba->quirks |= UFSHCD_QUIRK_BROKEN_AUTO_HIBERN8; @@ -174,6 +192,7 @@ static struct ufs_hba_variant_ops ufs_intel_cnl_hba_vops = { .init = ufs_intel_common_init, .exit = ufs_intel_common_exit, .link_startup_notify = ufs_intel_link_startup_notify, + .resume = ufs_intel_resume, }; static struct ufs_hba_variant_ops ufs_intel_ehl_hba_vops = { @@ -181,6 +200,7 @@ static struct ufs_hba_variant_ops ufs_intel_ehl_hba_vops = { .init = ufs_intel_ehl_init, .exit = ufs_intel_common_exit, .link_startup_notify = ufs_intel_link_startup_notify, + .resume = ufs_intel_resume, }; #ifdef CONFIG_PM_SLEEP From af423534d2de86cd0db729a5ac41f056ca8717de Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 7 Dec 2020 10:31:18 +0200 Subject: [PATCH 022/547] scsi: ufs-pci: Ensure UFS device is in PowerDown mode for suspend-to-disk ->poweroff() The expectation for suspend-to-disk is that devices will be powered-off, so the UFS device should be put in PowerDown mode. If spm_lvl is not 5, then that will not happen. Change the pm callbacks to force spm_lvl 5 for suspend-to-disk poweroff. Link: https://lore.kernel.org/r/20201207083120.26732-3-adrian.hunter@intel.com Signed-off-by: Adrian Hunter Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd-pci.c | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/ufs/ufshcd-pci.c b/drivers/scsi/ufs/ufshcd-pci.c index 360c25f1f061..5d33c39fa82f 100644 --- a/drivers/scsi/ufs/ufshcd-pci.c +++ b/drivers/scsi/ufs/ufshcd-pci.c @@ -227,6 +227,30 @@ static int ufshcd_pci_resume(struct device *dev) { return ufshcd_system_resume(dev_get_drvdata(dev)); } + +/** + * ufshcd_pci_poweroff - suspend-to-disk poweroff function + * @dev: pointer to PCI device handle + * + * Returns 0 if successful + * Returns non-zero otherwise + */ +static int ufshcd_pci_poweroff(struct device *dev) +{ + struct ufs_hba *hba = dev_get_drvdata(dev); + int spm_lvl = hba->spm_lvl; + int ret; + + /* + * For poweroff we need to set the UFS device to PowerDown mode. + * Force spm_lvl to ensure that. + */ + hba->spm_lvl = 5; + ret = ufshcd_system_suspend(hba); + hba->spm_lvl = spm_lvl; + return ret; +} + #endif /* !CONFIG_PM_SLEEP */ #ifdef CONFIG_PM @@ -322,8 +346,14 @@ ufshcd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) } static const struct dev_pm_ops ufshcd_pci_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(ufshcd_pci_suspend, - ufshcd_pci_resume) +#ifdef CONFIG_PM_SLEEP + .suspend = ufshcd_pci_suspend, + .resume = ufshcd_pci_resume, + .freeze = ufshcd_pci_suspend, + .thaw = ufshcd_pci_resume, + .poweroff = ufshcd_pci_poweroff, + .restore = ufshcd_pci_resume, +#endif SET_RUNTIME_PM_OPS(ufshcd_pci_runtime_suspend, ufshcd_pci_runtime_resume, ufshcd_pci_runtime_idle) From 044d5bda7117891d6d0d56f2f807b7b11e120abd Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 7 Dec 2020 10:31:19 +0200 Subject: [PATCH 023/547] scsi: ufs-pci: Fix recovery from hibernate exit errors for Intel controllers Intel controllers can end up in an unrecoverable state after a hibernate exit error unless a full reset and restore is done before anything else. Force that to happen. Link: https://lore.kernel.org/r/20201207083120.26732-4-adrian.hunter@intel.com Signed-off-by: Adrian Hunter Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd-pci.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/scsi/ufs/ufshcd-pci.c b/drivers/scsi/ufs/ufshcd-pci.c index 5d33c39fa82f..888d8c9ca3a5 100644 --- a/drivers/scsi/ufs/ufshcd-pci.c +++ b/drivers/scsi/ufs/ufshcd-pci.c @@ -178,6 +178,23 @@ static int ufs_intel_resume(struct ufs_hba *hba, enum ufs_pm_op op) REG_UTP_TASK_REQ_LIST_BASE_L); ufshcd_writel(hba, upper_32_bits(hba->utmrdl_dma_addr), REG_UTP_TASK_REQ_LIST_BASE_H); + + if (ufshcd_is_link_hibern8(hba)) { + int ret = ufshcd_uic_hibern8_exit(hba); + + if (!ret) { + ufshcd_set_link_active(hba); + } else { + dev_err(hba->dev, "%s: hibern8 exit failed %d\n", + __func__, ret); + /* + * Force reset and restore. Any other actions can lead + * to an unrecoverable state. + */ + ufshcd_set_link_off(hba); + } + } + return 0; } From dd78bdb6f810bdcb173b42379af558c676c8e0aa Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 7 Dec 2020 10:31:20 +0200 Subject: [PATCH 024/547] scsi: ufs-pci: Enable UFSHCD_CAP_RPM_AUTOSUSPEND for Intel controllers Enable runtime PM auto-suspend by default for Intel host controllers. Link: https://lore.kernel.org/r/20201207083120.26732-5-adrian.hunter@intel.com Signed-off-by: Adrian Hunter Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd-pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/scsi/ufs/ufshcd-pci.c b/drivers/scsi/ufs/ufshcd-pci.c index 888d8c9ca3a5..fadd566025b8 100644 --- a/drivers/scsi/ufs/ufshcd-pci.c +++ b/drivers/scsi/ufs/ufshcd-pci.c @@ -148,6 +148,8 @@ static int ufs_intel_common_init(struct ufs_hba *hba) { struct intel_host *host; + hba->caps |= UFSHCD_CAP_RPM_AUTOSUSPEND; + host = devm_kzalloc(hba->dev, sizeof(*host), GFP_KERNEL); if (!host) return -ENOMEM; From fa4d0f1992a96f6d7c988ef423e3127e613f6ac9 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 8 Dec 2020 21:29:44 -0800 Subject: [PATCH 025/547] scsi: block: Fix a race in the runtime power management code With the current implementation the following race can happen: * blk_pre_runtime_suspend() calls blk_freeze_queue_start() and blk_mq_unfreeze_queue(). * blk_queue_enter() calls blk_queue_pm_only() and that function returns true. * blk_queue_enter() calls blk_pm_request_resume() and that function does not call pm_request_resume() because the queue runtime status is RPM_ACTIVE. * blk_pre_runtime_suspend() changes the queue status into RPM_SUSPENDING. Fix this race by changing the queue runtime status into RPM_SUSPENDING before switching q_usage_counter to atomic mode. Link: https://lore.kernel.org/r/20201209052951.16136-2-bvanassche@acm.org Fixes: 986d413b7c15 ("blk-mq: Enable support for runtime power management") Cc: Ming Lei Cc: Rafael J. Wysocki Cc: stable Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Jens Axboe Acked-by: Alan Stern Acked-by: Stanley Chu Co-developed-by: Can Guo Signed-off-by: Can Guo Signed-off-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- block/blk-pm.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/block/blk-pm.c b/block/blk-pm.c index b85234d758f7..17bd020268d4 100644 --- a/block/blk-pm.c +++ b/block/blk-pm.c @@ -67,6 +67,10 @@ int blk_pre_runtime_suspend(struct request_queue *q) WARN_ON_ONCE(q->rpm_status != RPM_ACTIVE); + spin_lock_irq(&q->queue_lock); + q->rpm_status = RPM_SUSPENDING; + spin_unlock_irq(&q->queue_lock); + /* * Increase the pm_only counter before checking whether any * non-PM blk_queue_enter() calls are in progress to avoid that any @@ -89,15 +93,14 @@ int blk_pre_runtime_suspend(struct request_queue *q) /* Switch q_usage_counter back to per-cpu mode. */ blk_mq_unfreeze_queue(q); - spin_lock_irq(&q->queue_lock); - if (ret < 0) + if (ret < 0) { + spin_lock_irq(&q->queue_lock); + q->rpm_status = RPM_ACTIVE; pm_runtime_mark_last_busy(q->dev); - else - q->rpm_status = RPM_SUSPENDING; - spin_unlock_irq(&q->queue_lock); + spin_unlock_irq(&q->queue_lock); - if (ret) blk_clear_pm_only(q); + } return ret; } From 0854bcdcdec26aecdc92c303816f349ee1fba2bc Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 8 Dec 2020 21:29:45 -0800 Subject: [PATCH 026/547] scsi: block: Introduce BLK_MQ_REQ_PM Introduce the BLK_MQ_REQ_PM flag. This flag makes the request allocation functions set RQF_PM. This is the first step towards removing BLK_MQ_REQ_PREEMPT. Link: https://lore.kernel.org/r/20201209052951.16136-3-bvanassche@acm.org Cc: Alan Stern Cc: Stanley Chu Cc: Ming Lei Cc: Rafael J. Wysocki Cc: Can Guo Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Jens Axboe Reviewed-by: Can Guo Signed-off-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- block/blk-core.c | 7 ++++--- block/blk-mq.c | 2 ++ include/linux/blk-mq.h | 2 ++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 2db8bda43b6e..10696f9fb6ac 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -424,11 +424,11 @@ EXPORT_SYMBOL(blk_cleanup_queue); /** * blk_queue_enter() - try to increase q->q_usage_counter * @q: request queue pointer - * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PREEMPT + * @flags: BLK_MQ_REQ_NOWAIT, BLK_MQ_REQ_PM and/or BLK_MQ_REQ_PREEMPT */ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) { - const bool pm = flags & BLK_MQ_REQ_PREEMPT; + const bool pm = flags & (BLK_MQ_REQ_PM | BLK_MQ_REQ_PREEMPT); while (true) { bool success = false; @@ -630,7 +630,8 @@ struct request *blk_get_request(struct request_queue *q, unsigned int op, struct request *req; WARN_ON_ONCE(op & REQ_NOWAIT); - WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PREEMPT)); + WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PM | + BLK_MQ_REQ_PREEMPT)); req = blk_mq_alloc_request(q, op, flags); if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn) diff --git a/block/blk-mq.c b/block/blk-mq.c index 1b25ec2fe9be..b5880a1fb38d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -292,6 +292,8 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->mq_hctx = data->hctx; rq->rq_flags = 0; rq->cmd_flags = data->cmd_flags; + if (data->flags & BLK_MQ_REQ_PM) + rq->rq_flags |= RQF_PM; if (data->flags & BLK_MQ_REQ_PREEMPT) rq->rq_flags |= RQF_PREEMPT; if (blk_queue_io_stat(data->q)) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b23eeca4d677..c00e856c6fb1 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -444,6 +444,8 @@ enum { BLK_MQ_REQ_NOWAIT = (__force blk_mq_req_flags_t)(1 << 0), /* allocate from reserved pool */ BLK_MQ_REQ_RESERVED = (__force blk_mq_req_flags_t)(1 << 1), + /* set RQF_PM */ + BLK_MQ_REQ_PM = (__force blk_mq_req_flags_t)(1 << 2), /* set RQF_PREEMPT */ BLK_MQ_REQ_PREEMPT = (__force blk_mq_req_flags_t)(1 << 3), }; From 96d86e6a80a3ab9aff81d12f9f1f2a0da2917d38 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 8 Dec 2020 21:29:46 -0800 Subject: [PATCH 027/547] scsi: ide: Do not set the RQF_PREEMPT flag for sense requests RQF_PREEMPT is used for two different purposes in the legacy IDE code: 1. To mark power management requests. 2. To mark requests that should preempt another request. An (old) explanation of that feature is as follows: "The IDE driver in the Linux kernel normally uses a series of busywait delays during its initialization. When the driver executes these busywaits, the kernel does nothing for the duration of the wait. The time spent in these waits could be used for other initialization activities, if they could be run concurrently with these waits. More specifically, busywait-style delays such as udelay() in module init functions inhibit kernel preemption because the Big Kernel Lock is held, while yielding APIs such as schedule_timeout() allow preemption. This is true because the kernel handles the BKL specially and releases and reacquires it across reschedules allowed by the current thread. This IDE-preempt specification requires that the driver eliminate these busywaits and replace them with a mechanism that allows other work to proceed while the IDE driver is initializing." Since I haven't found an implementation of (2), do not set the PREEMPT flag for sense requests. This patch causes sense requests to be postponed while a drive is suspended instead of being submitted to ide_queue_rq(). If it would ever be necessary to restore the IDE PREEMPT functionality, that can be done by introducing a new flag in struct ide_request. Link: https://lore.kernel.org/r/20201209052951.16136-4-bvanassche@acm.org Cc: David S. Miller Cc: Alan Stern Cc: Can Guo Cc: Stanley Chu Cc: Ming Lei Cc: Rafael J. Wysocki Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Jens Axboe Signed-off-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/ide/ide-atapi.c | 1 - drivers/ide/ide-io.c | 5 ----- 2 files changed, 6 deletions(-) diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 2162bc80f09e..013ad33fbbc8 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -223,7 +223,6 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq) sense_rq->rq_disk = rq->rq_disk; sense_rq->cmd_flags = REQ_OP_DRV_IN; ide_req(sense_rq)->type = ATA_PRIV_SENSE; - sense_rq->rq_flags |= RQF_PREEMPT; req->cmd[0] = GPCMD_REQUEST_SENSE; req->cmd[4] = cmd_len; diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 1a53c7a75224..c210ea3bd02f 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -515,11 +515,6 @@ repeat: * above to return us whatever is in the queue. Since we call * ide_do_request() ourselves, we end up taking requests while * the queue is blocked... - * - * We let requests forced at head of queue with ide-preempt - * though. I hope that doesn't happen too much, hopefully not - * unless the subdriver triggers such a thing in its own PM - * state machine. */ if ((drive->dev_flags & IDE_DFLAG_BLOCKED) && ata_pm_request(rq) == 0 && From 5ae65383fc7633e0247c31b0c8bf0e6ea63b95a3 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 8 Dec 2020 21:29:47 -0800 Subject: [PATCH 028/547] scsi: ide: Mark power management requests with RQF_PM instead of RQF_PREEMPT This is another step that prepares for the removal of RQF_PREEMPT. Link: https://lore.kernel.org/r/20201209052951.16136-5-bvanassche@acm.org Cc: David S. Miller Cc: Alan Stern Cc: Can Guo Cc: Stanley Chu Cc: Ming Lei Cc: Rafael J. Wysocki Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Jens Axboe Signed-off-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/ide/ide-io.c | 2 +- drivers/ide/ide-pm.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index c210ea3bd02f..4867b67b60d6 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -518,7 +518,7 @@ repeat: */ if ((drive->dev_flags & IDE_DFLAG_BLOCKED) && ata_pm_request(rq) == 0 && - (rq->rq_flags & RQF_PREEMPT) == 0) { + (rq->rq_flags & RQF_PM) == 0) { /* there should be no pending command at this point */ ide_unlock_port(hwif); goto plug_device; diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index 192e6c65d34e..82ab308f1aaf 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -77,7 +77,7 @@ int generic_ide_resume(struct device *dev) } memset(&rqpm, 0, sizeof(rqpm)); - rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, BLK_MQ_REQ_PREEMPT); + rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, BLK_MQ_REQ_PM); ide_req(rq)->type = ATA_PRIV_PM_RESUME; ide_req(rq)->special = &rqpm; rqpm.pm_step = IDE_PM_START_RESUME; From cfefd9f8240a7b9fdd96fcd54cb029870b6d8d88 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 8 Dec 2020 21:29:48 -0800 Subject: [PATCH 029/547] scsi: scsi_transport_spi: Set RQF_PM for domain validation commands Disable runtime power management during domain validation. Since a later patch removes RQF_PREEMPT, set RQF_PM for domain validation commands such that these are executed in the quiesced SCSI device state. Link: https://lore.kernel.org/r/20201209052951.16136-6-bvanassche@acm.org Cc: Alan Stern Cc: James Bottomley Cc: Woody Suwalski Cc: Can Guo Cc: Stanley Chu Cc: Ming Lei Cc: Rafael J. Wysocki Cc: Stan Johnson Reviewed-by: Christoph Hellwig Reviewed-by: Jens Axboe Reviewed-by: Hannes Reinecke Signed-off-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_transport_spi.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/drivers/scsi/scsi_transport_spi.c b/drivers/scsi/scsi_transport_spi.c index f3d5b1bbd5aa..c37dd15d16d2 100644 --- a/drivers/scsi/scsi_transport_spi.c +++ b/drivers/scsi/scsi_transport_spi.c @@ -117,12 +117,16 @@ static int spi_execute(struct scsi_device *sdev, const void *cmd, sshdr = &sshdr_tmp; for(i = 0; i < DV_RETRIES; i++) { + /* + * The purpose of the RQF_PM flag below is to bypass the + * SDEV_QUIESCE state. + */ result = scsi_execute(sdev, cmd, dir, buffer, bufflen, sense, sshdr, DV_TIMEOUT, /* retries */ 1, REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER, - 0, NULL); + RQF_PM, NULL); if (driver_byte(result) != DRIVER_SENSE || sshdr->sense_key != UNIT_ATTENTION) break; @@ -1005,23 +1009,26 @@ spi_dv_device(struct scsi_device *sdev) */ lock_system_sleep(); + if (scsi_autopm_get_device(sdev)) + goto unlock_system_sleep; + if (unlikely(spi_dv_in_progress(starget))) - goto unlock; + goto put_autopm; if (unlikely(scsi_device_get(sdev))) - goto unlock; + goto put_autopm; spi_dv_in_progress(starget) = 1; buffer = kzalloc(len, GFP_KERNEL); if (unlikely(!buffer)) - goto out_put; + goto put_sdev; /* We need to verify that the actual device will quiesce; the * later target quiesce is just a nice to have */ if (unlikely(scsi_device_quiesce(sdev))) - goto out_free; + goto free_buffer; scsi_target_quiesce(starget); @@ -1041,12 +1048,16 @@ spi_dv_device(struct scsi_device *sdev) spi_initial_dv(starget) = 1; - out_free: +free_buffer: kfree(buffer); - out_put: + +put_sdev: spi_dv_in_progress(starget) = 0; scsi_device_put(sdev); -unlock: +put_autopm: + scsi_autopm_put_device(sdev); + +unlock_system_sleep: unlock_system_sleep(); } EXPORT_SYMBOL(spi_dv_device); From e6044f714b256259df9611ff49af433e5411c5c8 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 8 Dec 2020 21:29:49 -0800 Subject: [PATCH 030/547] scsi: core: Only process PM requests if rpm_status != RPM_ACTIVE Instead of submitting all SCSI commands submitted with scsi_execute() to a SCSI device if rpm_status != RPM_ACTIVE, only submit RQF_PM (power management requests) if rpm_status != RPM_ACTIVE. This patch makes the SCSI core handle the runtime power management status (rpm_status) as it should be handled. Link: https://lore.kernel.org/r/20201209052951.16136-7-bvanassche@acm.org Cc: Can Guo Cc: Stanley Chu Cc: Alan Stern Cc: Ming Lei Cc: Rafael J. Wysocki Cc: Martin Kepplinger Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Jens Axboe Reviewed-by: Can Guo Signed-off-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_lib.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index b7ac14571415..91bc39a4c3c3 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -249,7 +249,8 @@ int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, req = blk_get_request(sdev->request_queue, data_direction == DMA_TO_DEVICE ? - REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, BLK_MQ_REQ_PREEMPT); + REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, + rq_flags & RQF_PM ? BLK_MQ_REQ_PM : 0); if (IS_ERR(req)) return ret; rq = scsi_req(req); @@ -1206,6 +1207,8 @@ static blk_status_t scsi_device_state_check(struct scsi_device *sdev, struct request *req) { switch (sdev->sdev_state) { + case SDEV_CREATED: + return BLK_STS_OK; case SDEV_OFFLINE: case SDEV_TRANSPORT_OFFLINE: /* @@ -1232,18 +1235,18 @@ scsi_device_state_check(struct scsi_device *sdev, struct request *req) return BLK_STS_RESOURCE; case SDEV_QUIESCE: /* - * If the devices is blocked we defer normal commands. + * If the device is blocked we only accept power management + * commands. */ - if (req && !(req->rq_flags & RQF_PREEMPT)) + if (req && WARN_ON_ONCE(!(req->rq_flags & RQF_PM))) return BLK_STS_RESOURCE; return BLK_STS_OK; default: /* * For any other not fully online state we only allow - * special commands. In particular any user initiated - * command is not allowed. + * power management commands. */ - if (req && !(req->rq_flags & RQF_PREEMPT)) + if (req && !(req->rq_flags & RQF_PM)) return BLK_STS_IOERR; return BLK_STS_OK; } @@ -2517,15 +2520,13 @@ void sdev_evt_send_simple(struct scsi_device *sdev, EXPORT_SYMBOL_GPL(sdev_evt_send_simple); /** - * scsi_device_quiesce - Block user issued commands. + * scsi_device_quiesce - Block all commands except power management. * @sdev: scsi device to quiesce. * * This works by trying to transition to the SDEV_QUIESCE state * (which must be a legal transition). When the device is in this - * state, only special requests will be accepted, all others will - * be deferred. Since special requests may also be requeued requests, - * a successful return doesn't guarantee the device will be - * totally quiescent. + * state, only power management requests will be accepted, all others will + * be deferred. * * Must be called with user context, may sleep. * @@ -2587,12 +2588,12 @@ void scsi_device_resume(struct scsi_device *sdev) * device deleted during suspend) */ mutex_lock(&sdev->state_mutex); + if (sdev->sdev_state == SDEV_QUIESCE) + scsi_device_set_state(sdev, SDEV_RUNNING); if (sdev->quiesced_by) { sdev->quiesced_by = NULL; blk_clear_pm_only(sdev->request_queue); } - if (sdev->sdev_state == SDEV_QUIESCE) - scsi_device_set_state(sdev, SDEV_RUNNING); mutex_unlock(&sdev->state_mutex); } EXPORT_SYMBOL(scsi_device_resume); From a4d34da715e3cb7e0741fe603dcd511bed067e00 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 8 Dec 2020 21:29:50 -0800 Subject: [PATCH 031/547] scsi: block: Remove RQF_PREEMPT and BLK_MQ_REQ_PREEMPT Remove flag RQF_PREEMPT and BLK_MQ_REQ_PREEMPT since these are no longer used by any kernel code. Link: https://lore.kernel.org/r/20201209052951.16136-8-bvanassche@acm.org Cc: Can Guo Cc: Stanley Chu Cc: Alan Stern Cc: Ming Lei Cc: Rafael J. Wysocki Cc: Martin Kepplinger Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Jens Axboe Reviewed-by: Can Guo Signed-off-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- block/blk-core.c | 7 +++---- block/blk-mq-debugfs.c | 1 - block/blk-mq.c | 2 -- include/linux/blk-mq.h | 2 -- include/linux/blkdev.h | 6 +----- 5 files changed, 4 insertions(+), 14 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 10696f9fb6ac..a00bce9f46d8 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -424,11 +424,11 @@ EXPORT_SYMBOL(blk_cleanup_queue); /** * blk_queue_enter() - try to increase q->q_usage_counter * @q: request queue pointer - * @flags: BLK_MQ_REQ_NOWAIT, BLK_MQ_REQ_PM and/or BLK_MQ_REQ_PREEMPT + * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PM */ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) { - const bool pm = flags & (BLK_MQ_REQ_PM | BLK_MQ_REQ_PREEMPT); + const bool pm = flags & BLK_MQ_REQ_PM; while (true) { bool success = false; @@ -630,8 +630,7 @@ struct request *blk_get_request(struct request_queue *q, unsigned int op, struct request *req; WARN_ON_ONCE(op & REQ_NOWAIT); - WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PM | - BLK_MQ_REQ_PREEMPT)); + WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PM)); req = blk_mq_alloc_request(q, op, flags); if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 3094542e12ae..9336a6f8d6ef 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -297,7 +297,6 @@ static const char *const rqf_name[] = { RQF_NAME(MIXED_MERGE), RQF_NAME(MQ_INFLIGHT), RQF_NAME(DONTPREP), - RQF_NAME(PREEMPT), RQF_NAME(FAILED), RQF_NAME(QUIET), RQF_NAME(ELVPRIV), diff --git a/block/blk-mq.c b/block/blk-mq.c index b5880a1fb38d..d50504888b68 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -294,8 +294,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->cmd_flags = data->cmd_flags; if (data->flags & BLK_MQ_REQ_PM) rq->rq_flags |= RQF_PM; - if (data->flags & BLK_MQ_REQ_PREEMPT) - rq->rq_flags |= RQF_PREEMPT; if (blk_queue_io_stat(data->q)) rq->rq_flags |= RQF_IO_STAT; INIT_LIST_HEAD(&rq->queuelist); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index c00e856c6fb1..88af1df94308 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -446,8 +446,6 @@ enum { BLK_MQ_REQ_RESERVED = (__force blk_mq_req_flags_t)(1 << 1), /* set RQF_PM */ BLK_MQ_REQ_PM = (__force blk_mq_req_flags_t)(1 << 2), - /* set RQF_PREEMPT */ - BLK_MQ_REQ_PREEMPT = (__force blk_mq_req_flags_t)(1 << 3), }; struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 639cae2c158b..7d4b746f7e6a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -79,9 +79,6 @@ typedef __u32 __bitwise req_flags_t; #define RQF_MQ_INFLIGHT ((__force req_flags_t)(1 << 6)) /* don't call prep for this one */ #define RQF_DONTPREP ((__force req_flags_t)(1 << 7)) -/* set for "ide_preempt" requests and also for requests for which the SCSI - "quiesce" state must be ignored. */ -#define RQF_PREEMPT ((__force req_flags_t)(1 << 8)) /* vaguely specified driver internal error. Ignored by the block layer */ #define RQF_FAILED ((__force req_flags_t)(1 << 10)) /* don't warn about errors */ @@ -430,8 +427,7 @@ struct request_queue { unsigned long queue_flags; /* * Number of contexts that have called blk_set_pm_only(). If this - * counter is above zero then only RQF_PM and RQF_PREEMPT requests are - * processed. + * counter is above zero then only RQF_PM requests are processed. */ atomic_t pm_only; From 52abca64fd9410ea6c9a3a74eab25663b403d7da Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Tue, 8 Dec 2020 21:29:51 -0800 Subject: [PATCH 032/547] scsi: block: Do not accept any requests while suspended blk_queue_enter() accepts BLK_MQ_REQ_PM requests independent of the runtime power management state. Now that SCSI domain validation no longer depends on this behavior, modify the behavior of blk_queue_enter() as follows: - Do not accept any requests while suspended. - Only process power management requests while suspending or resuming. Submitting BLK_MQ_REQ_PM requests to a device that is runtime suspended causes runtime-suspended devices not to resume as they should. The request which should cause a runtime resume instead gets issued directly, without resuming the device first. Of course the device can't handle it properly, the I/O fails, and the device remains suspended. The problem is fixed by checking that the queue's runtime-PM status isn't RPM_SUSPENDED before allowing a request to be issued, and queuing a runtime-resume request if it is. In particular, the inline blk_pm_request_resume() routine is renamed blk_pm_resume_queue() and the code is unified by merging the surrounding checks into the routine. If the queue isn't set up for runtime PM, or there currently is no restriction on allowed requests, the request is allowed. Likewise if the BLK_MQ_REQ_PM flag is set and the status isn't RPM_SUSPENDED. Otherwise a runtime resume is queued and the request is blocked until conditions are more suitable. [ bvanassche: modified commit message and removed Cc: stable because without the previous patches from this series this patch would break parallel SCSI domain validation + introduced queue_rpm_status() ] Link: https://lore.kernel.org/r/20201209052951.16136-9-bvanassche@acm.org Cc: Jens Axboe Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Can Guo Cc: Stanley Chu Cc: Ming Lei Cc: Rafael J. Wysocki Reported-and-tested-by: Martin Kepplinger Reviewed-by: Hannes Reinecke Reviewed-by: Can Guo Signed-off-by: Alan Stern Signed-off-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- block/blk-core.c | 7 ++++--- block/blk-pm.h | 14 +++++++++----- include/linux/blkdev.h | 12 ++++++++++++ 3 files changed, 25 insertions(+), 8 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index a00bce9f46d8..2d53e2ff48ff 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -440,7 +441,8 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) * responsible for ensuring that that counter is * globally visible before the queue is unfrozen. */ - if (pm || !blk_queue_pm_only(q)) { + if ((pm && queue_rpm_status(q) != RPM_SUSPENDED) || + !blk_queue_pm_only(q)) { success = true; } else { percpu_ref_put(&q->q_usage_counter); @@ -465,8 +467,7 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) wait_event(q->mq_freeze_wq, (!q->mq_freeze_depth && - (pm || (blk_pm_request_resume(q), - !blk_queue_pm_only(q)))) || + blk_pm_resume_queue(pm, q)) || blk_queue_dying(q)); if (blk_queue_dying(q)) return -ENODEV; diff --git a/block/blk-pm.h b/block/blk-pm.h index ea5507d23e75..a2283cc9f716 100644 --- a/block/blk-pm.h +++ b/block/blk-pm.h @@ -6,11 +6,14 @@ #include #ifdef CONFIG_PM -static inline void blk_pm_request_resume(struct request_queue *q) +static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q) { - if (q->dev && (q->rpm_status == RPM_SUSPENDED || - q->rpm_status == RPM_SUSPENDING)) - pm_request_resume(q->dev); + if (!q->dev || !blk_queue_pm_only(q)) + return 1; /* Nothing to do */ + if (pm && q->rpm_status != RPM_SUSPENDED) + return 1; /* Request allowed */ + pm_request_resume(q->dev); + return 0; } static inline void blk_pm_mark_last_busy(struct request *rq) @@ -44,8 +47,9 @@ static inline void blk_pm_put_request(struct request *rq) --rq->q->nr_pending; } #else -static inline void blk_pm_request_resume(struct request_queue *q) +static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q) { + return 1; } static inline void blk_pm_mark_last_busy(struct request *rq) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7d4b746f7e6a..2b6fc3fb3a99 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -692,6 +692,18 @@ static inline bool queue_is_mq(struct request_queue *q) return q->mq_ops; } +#ifdef CONFIG_PM +static inline enum rpm_status queue_rpm_status(struct request_queue *q) +{ + return q->rpm_status; +} +#else +static inline enum rpm_status queue_rpm_status(struct request_queue *q) +{ + return RPM_ACTIVE; +} +#endif + static inline enum blk_zoned_model blk_queue_zoned_model(struct request_queue *q) { From 8b3c8035297e71abb9e6d0f50ceab50d33c0d64b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 9 Dec 2020 17:03:11 +0300 Subject: [PATCH 033/547] scsi: mpt3sas: Signedness bug in _base_get_diag_triggers() The "trigger_flags" variable needs to be signed for the error checking to work. Link: https://lore.kernel.org/r/X9DZH37bYPHwSQRP@mwanda Fixes: aec93e8e2385 ("scsi: mpt3sas: Add persistent trigger pages support") Signed-off-by: Dan Carpenter Signed-off-by: Martin K. Petersen --- drivers/scsi/mpt3sas/mpt3sas_base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.c b/drivers/scsi/mpt3sas/mpt3sas_base.c index b129f3734ed0..26537d503a8b 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_base.c +++ b/drivers/scsi/mpt3sas/mpt3sas_base.c @@ -5027,7 +5027,7 @@ _base_check_for_trigger_pages_support(struct MPT3SAS_ADAPTER *ioc) static void _base_get_diag_triggers(struct MPT3SAS_ADAPTER *ioc) { - u16 trigger_flags; + int trigger_flags; /* * Default setting of master trigger. From 5213dc7940e0aa4c094413e015790c4a310ef36c Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Wed, 9 Dec 2020 14:31:44 +0800 Subject: [PATCH 034/547] scsi: ufs-mediatek: Use correct path to fix compile error When the kernel is compiled with allmodconfig, the following error is reported: In file included from drivers/scsi/ufs/ufs-mediatek-trace.h:36:0, from drivers/scsi/ufs/ufs-mediatek.c:28: ./include/trace/define_trace.h:95:42: fatal error: ./ufs-mediatek-trace.h: No such file or directory #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) The comment in include/trace/define_trace.h specifies that: TRACE_INCLUDE_PATH: Note, the path is relative to define_trace.h, not the file including it. Full path names for out of tree modules must be used. So without "CFLAGS_ufs-mediatek.o := -I$(src)", the current directory "." is "include/trace/", the relative path of ufs-mediatek-trace.h is "../../drivers/scsi/ufs/". Link: https://lore.kernel.org/r/20201209063144.1840-2-thunder.leizhen@huawei.com Fixes: ca1bb061d644 ("scsi: ufs-mediatek: Introduce event_notify implementation") Reviewed-by: Stanley Chu Signed-off-by: Zhen Lei Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufs-mediatek-trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/ufs/ufs-mediatek-trace.h b/drivers/scsi/ufs/ufs-mediatek-trace.h index fd6f84c1b4e2..895e82ea6ece 100644 --- a/drivers/scsi/ufs/ufs-mediatek-trace.h +++ b/drivers/scsi/ufs/ufs-mediatek-trace.h @@ -31,6 +31,6 @@ TRACE_EVENT(ufs_mtk_event, #undef TRACE_INCLUDE_PATH #undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_PATH ../../drivers/scsi/ufs/ #define TRACE_INCLUDE_FILE ufs-mediatek-trace #include From bd14bf0e4a084514aa62d24d2109e0f09a93822f Mon Sep 17 00:00:00 2001 From: Stanley Chu Date: Tue, 8 Dec 2020 21:56:34 +0800 Subject: [PATCH 035/547] scsi: ufs: Re-enable WriteBooster after device reset UFS 3.1 specification mentions that the WriteBooster flags listed below will be set to their default values, i.e. disabled, after power cycle or any type of reset event. Thus we need to reset the flag variables kept in struct hba to align with the device status and ensure that WriteBooster-related functions are configured properly after device reset. Without this fix, WriteBooster will not be enabled successfully after by ufshcd_wb_ctrl() after device reset because hba->wb_enabled remains true. Flags required to be reset to default values: - fWriteBoosterEn: hba->wb_enabled - fWriteBoosterBufferFlushEn: hba->wb_buf_flush_enabled - fWriteBoosterBufferFlushDuringHibernate: No variable mapped Link: https://lore.kernel.org/r/20201208135635.15326-2-stanley.chu@mediatek.com Fixes: 3d17b9b5ab11 ("scsi: ufs: Add write booster feature support") Reviewed-by: Bean Huo Signed-off-by: Stanley Chu Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h index 08c8a591e6b0..36d367eb8139 100644 --- a/drivers/scsi/ufs/ufshcd.h +++ b/drivers/scsi/ufs/ufshcd.h @@ -1221,8 +1221,13 @@ static inline void ufshcd_vops_device_reset(struct ufs_hba *hba) if (hba->vops && hba->vops->device_reset) { int err = hba->vops->device_reset(hba); - if (!err) + if (!err) { ufshcd_set_ufs_dev_active(hba); + if (ufshcd_is_wb_allowed(hba)) { + hba->wb_enabled = false; + hba->wb_buf_flush_enabled = false; + } + } if (err != -EOPNOTSUPP) ufshcd_update_evt_hist(hba, UFS_EVT_DEV_RESET, err); } From 31a5d9cafff163473abf9496318b6a53022d48f7 Mon Sep 17 00:00:00 2001 From: Stanley Chu Date: Tue, 8 Dec 2020 21:56:35 +0800 Subject: [PATCH 036/547] scsi: ufs: Un-inline ufshcd_vops_device_reset function More and more statements are being added to ufshcd_vops_device_reset() and this function is being called from multiple locations in the driver. Un-inline the function to allow the compiler to make better decisions. Link: https://lore.kernel.org/r/20201208135635.15326-3-stanley.chu@mediatek.com Reviewed-by: Bean Huo Signed-off-by: Stanley Chu Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd.c | 27 ++++++++++++++++++++++----- drivers/scsi/ufs/ufshcd.h | 17 ++++------------- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 4f3ac2566322..e221add25a7e 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -581,6 +581,23 @@ static void ufshcd_print_pwr_info(struct ufs_hba *hba) hba->pwr_info.hs_rate); } +static void ufshcd_device_reset(struct ufs_hba *hba) +{ + int err; + + err = ufshcd_vops_device_reset(hba); + + if (!err) { + ufshcd_set_ufs_dev_active(hba); + if (ufshcd_is_wb_allowed(hba)) { + hba->wb_enabled = false; + hba->wb_buf_flush_enabled = false; + } + } + if (err != -EOPNOTSUPP) + ufshcd_update_evt_hist(hba, UFS_EVT_DEV_RESET, err); +} + void ufshcd_delay_us(unsigned long us, unsigned long tolerance) { if (!us) @@ -3933,7 +3950,7 @@ int ufshcd_link_recovery(struct ufs_hba *hba) spin_unlock_irqrestore(hba->host->host_lock, flags); /* Reset the attached device */ - ufshcd_vops_device_reset(hba); + ufshcd_device_reset(hba); ret = ufshcd_host_reset_and_restore(hba); @@ -6935,7 +6952,7 @@ static int ufshcd_reset_and_restore(struct ufs_hba *hba) do { /* Reset the attached device */ - ufshcd_vops_device_reset(hba); + ufshcd_device_reset(hba); err = ufshcd_host_reset_and_restore(hba); } while (err && --retries); @@ -8708,7 +8725,7 @@ set_link_active: * further below. */ if (ufshcd_is_ufs_dev_deepsleep(hba)) { - ufshcd_vops_device_reset(hba); + ufshcd_device_reset(hba); WARN_ON(!ufshcd_is_link_off(hba)); } if (ufshcd_is_link_hibern8(hba) && !ufshcd_uic_hibern8_exit(hba)) @@ -8718,7 +8735,7 @@ set_link_active: set_dev_active: /* Can also get here needing to exit DeepSleep */ if (ufshcd_is_ufs_dev_deepsleep(hba)) { - ufshcd_vops_device_reset(hba); + ufshcd_device_reset(hba); ufshcd_host_reset_and_restore(hba); } if (!ufshcd_set_dev_pwr_mode(hba, UFS_ACTIVE_PWR_MODE)) @@ -9317,7 +9334,7 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) } /* Reset the attached device */ - ufshcd_vops_device_reset(hba); + ufshcd_device_reset(hba); ufshcd_init_crypto(hba); diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h index 36d367eb8139..9bb5f0ed4124 100644 --- a/drivers/scsi/ufs/ufshcd.h +++ b/drivers/scsi/ufs/ufshcd.h @@ -1216,21 +1216,12 @@ static inline void ufshcd_vops_dbg_register_dump(struct ufs_hba *hba) hba->vops->dbg_register_dump(hba); } -static inline void ufshcd_vops_device_reset(struct ufs_hba *hba) +static inline int ufshcd_vops_device_reset(struct ufs_hba *hba) { - if (hba->vops && hba->vops->device_reset) { - int err = hba->vops->device_reset(hba); + if (hba->vops && hba->vops->device_reset) + return hba->vops->device_reset(hba); - if (!err) { - ufshcd_set_ufs_dev_active(hba); - if (ufshcd_is_wb_allowed(hba)) { - hba->wb_enabled = false; - hba->wb_buf_flush_enabled = false; - } - } - if (err != -EOPNOTSUPP) - ufshcd_update_evt_hist(hba, UFS_EVT_DEV_RESET, err); - } + return -EOPNOTSUPP; } static inline void ufshcd_vops_config_scaling_param(struct ufs_hba *hba, From cb5253198f10a4cd79b7523c581e6173c7d49ddb Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 8 Dec 2020 14:05:05 -0800 Subject: [PATCH 037/547] scsi: cxgb4i: Fix TLS dependency SCSI_CXGB4_ISCSI selects CHELSIO_T4. The latter depends on TLS || TLS=n, so since 'select' does not check dependencies of the selected symbol, SCSI_CXGB4_ISCSI should also depend on TLS || TLS=n. This prevents the following kconfig warning and restricts SCSI_CXGB4_ISCSI to 'm' whenever TLS=m. WARNING: unmet direct dependencies detected for CHELSIO_T4 Depends on [m]: NETDEVICES [=y] && ETHERNET [=y] && NET_VENDOR_CHELSIO [=y] && PCI [=y] && (IPV6 [=y] || IPV6 [=y]=n) && (TLS [=m] || TLS [=m]=n) Selected by [y]: - SCSI_CXGB4_ISCSI [=y] && SCSI_LOWLEVEL [=y] && SCSI [=y] && PCI [=y] && INET [=y] && (IPV6 [=y] || IPV6 [=y]=n) && ETHERNET [=y] Link: https://lore.kernel.org/r/20201208220505.24488-1-rdunlap@infradead.org Fixes: 7b36b6e03b0d ("[SCSI] cxgb4i v5: iscsi driver") Cc: Karen Xie Cc: linux-scsi@vger.kernel.org Cc: "James E.J. Bottomley" Cc: "Martin K. Petersen" Signed-off-by: Randy Dunlap Signed-off-by: Martin K. Petersen --- drivers/scsi/cxgbi/cxgb4i/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/cxgbi/cxgb4i/Kconfig b/drivers/scsi/cxgbi/cxgb4i/Kconfig index b206e266b4e7..8b0deece9758 100644 --- a/drivers/scsi/cxgbi/cxgb4i/Kconfig +++ b/drivers/scsi/cxgbi/cxgb4i/Kconfig @@ -4,6 +4,7 @@ config SCSI_CXGB4_ISCSI depends on PCI && INET && (IPV6 || IPV6=n) depends on THERMAL || !THERMAL depends on ETHERNET + depends on TLS || TLS=n select NET_VENDOR_CHELSIO select CHELSIO_T4 select CHELSIO_LIB From 43ffe817bfe3871ffbaa1e98952a2a01b140e71e Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Thu, 10 Dec 2020 15:10:56 +0530 Subject: [PATCH 038/547] arm64: dts: bitmain: Use generic "ngpios" rather than "snps,nr-gpios" This is to remove similar errors as below: OF: /.../gpio-port@0: could not find phandle Commit 7569486d79ae ("gpio: dwapb: Add ngpios DT-property support") explained the reason of above errors well and added the generic "ngpios" property, let's use it. Signed-off-by: Jisheng Zhang Signed-off-by: Manivannan Sadhasivam Reviewed-by: Linus Walleij Acked-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20201210094056.54553-1-manivannan.sadhasivam@linaro.org' Signed-off-by: Arnd Bergmann --- arch/arm64/boot/dts/bitmain/bm1880.dtsi | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/boot/dts/bitmain/bm1880.dtsi b/arch/arm64/boot/dts/bitmain/bm1880.dtsi index fa6e6905f588..53a9b76057aa 100644 --- a/arch/arm64/boot/dts/bitmain/bm1880.dtsi +++ b/arch/arm64/boot/dts/bitmain/bm1880.dtsi @@ -127,7 +127,7 @@ compatible = "snps,dw-apb-gpio-port"; gpio-controller; #gpio-cells = <2>; - snps,nr-gpios = <32>; + ngpios = <32>; reg = <0>; interrupt-controller; #interrupt-cells = <2>; @@ -145,7 +145,7 @@ compatible = "snps,dw-apb-gpio-port"; gpio-controller; #gpio-cells = <2>; - snps,nr-gpios = <32>; + ngpios = <32>; reg = <0>; interrupt-controller; #interrupt-cells = <2>; @@ -163,7 +163,7 @@ compatible = "snps,dw-apb-gpio-port"; gpio-controller; #gpio-cells = <2>; - snps,nr-gpios = <8>; + ngpios = <8>; reg = <0>; interrupt-controller; #interrupt-cells = <2>; From 1b04fa9900263b4e217ca2509fd778b32c2b4eb2 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 9 Dec 2020 21:27:31 +0100 Subject: [PATCH 039/547] rcu-tasks: Move RCU-tasks initialization to before early_initcall() PowerPC testing encountered boot failures due to RCU Tasks not being fully initialized until core_initcall() time. This commit therefore initializes RCU Tasks (along with Rude RCU and RCU Tasks Trace) just before early_initcall() time, thus allowing waiting on RCU Tasks grace periods from early_initcall() handlers. Link: https://lore.kernel.org/rcu/87eekfh80a.fsf@dja-thinkpad.axtens.net/ Fixes: 36dadef23fcc ("kprobes: Init kprobes in early_initcall") Tested-by: Daniel Axtens Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 6 ++++++ init/main.c | 1 + kernel/rcu/tasks.h | 25 +++++++++++++++++++++---- 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 6cdd0152c253..5c119d6cecf1 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -86,6 +86,12 @@ void rcu_sched_clock_irq(int user); void rcu_report_dead(unsigned int cpu); void rcutree_migrate_callbacks(int cpu); +#ifdef CONFIG_TASKS_RCU_GENERIC +void rcu_init_tasks_generic(void); +#else +static inline void rcu_init_tasks_generic(void) { } +#endif + #ifdef CONFIG_RCU_STALL_COMMON void rcu_sysrq_start(void); void rcu_sysrq_end(void); diff --git a/init/main.c b/init/main.c index 32b2a8affafd..9d964511fe0c 100644 --- a/init/main.c +++ b/init/main.c @@ -1512,6 +1512,7 @@ static noinline void __init kernel_init_freeable(void) init_mm_internals(); + rcu_init_tasks_generic(); do_pre_smp_initcalls(); lockup_detector_init(); diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index d5d9f2d03e8a..73bbe792fe1e 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -241,7 +241,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) } } -/* Spawn RCU-tasks grace-period kthread, e.g., at core_initcall() time. */ +/* Spawn RCU-tasks grace-period kthread. */ static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp) { struct task_struct *t; @@ -569,7 +569,6 @@ static int __init rcu_spawn_tasks_kthread(void) rcu_spawn_tasks_kthread_generic(&rcu_tasks); return 0; } -core_initcall(rcu_spawn_tasks_kthread); #ifndef CONFIG_TINY_RCU static void show_rcu_tasks_classic_gp_kthread(void) @@ -697,7 +696,6 @@ static int __init rcu_spawn_tasks_rude_kthread(void) rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude); return 0; } -core_initcall(rcu_spawn_tasks_rude_kthread); #ifndef CONFIG_TINY_RCU static void show_rcu_tasks_rude_gp_kthread(void) @@ -975,6 +973,11 @@ static void rcu_tasks_trace_pregp_step(void) static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop) { + // During early boot when there is only the one boot CPU, there + // is no idle task for the other CPUs. Just return. + if (unlikely(t == NULL)) + return; + WRITE_ONCE(t->trc_reader_special.b.need_qs, false); WRITE_ONCE(t->trc_reader_checked, false); t->trc_ipi_to_cpu = -1; @@ -1200,7 +1203,6 @@ static int __init rcu_spawn_tasks_trace_kthread(void) rcu_spawn_tasks_kthread_generic(&rcu_tasks_trace); return 0; } -core_initcall(rcu_spawn_tasks_trace_kthread); #ifndef CONFIG_TINY_RCU static void show_rcu_tasks_trace_gp_kthread(void) @@ -1229,6 +1231,21 @@ void show_rcu_tasks_gp_kthreads(void) } #endif /* #ifndef CONFIG_TINY_RCU */ +void __init rcu_init_tasks_generic(void) +{ +#ifdef CONFIG_TASKS_RCU + rcu_spawn_tasks_kthread(); +#endif + +#ifdef CONFIG_TASKS_RUDE_RCU + rcu_spawn_tasks_rude_kthread(); +#endif + +#ifdef CONFIG_TASKS_TRACE_RCU + rcu_spawn_tasks_trace_kthread(); +#endif +} + #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ static inline void rcu_tasks_bootup_oddness(void) {} void show_rcu_tasks_gp_kthreads(void) {} From 57f04815fd95bb8c46f6ec5c9d25430bb52d419f Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Thu, 10 Dec 2020 09:40:28 -0800 Subject: [PATCH 040/547] drm/msm: Fix WARN_ON() splat in _free_object() [ 192.062000] ------------[ cut here ]------------ [ 192.062498] WARNING: CPU: 3 PID: 2039 at drivers/gpu/drm/msm/msm_gem.c:381 put_iova_vmas+0x94/0xa0 [msm] [ 192.062870] Modules linked in: snd_hrtimer snd_seq snd_seq_device rfcomm algif_hash algif_skcipher af_alg bnep xt_CHECKSUM nft_chain_nat xt_MASQUERADE nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 nft_counter xt_tcpudp nft_compat cpufreq_powersave cpufreq_conservative q6asm_dai q6routing q6afe_dai q6adm bridge q6afe q6asm q6dsp_common q6core stp llc nf_tables libcrc32c nfnetlink snd_soc_wsa881x regmap_sdw soundwire_qcom gpio_wcd934x snd_soc_wcd934x wcd934x regmap_slimbus venus_enc venus_dec apr videobuf2_dma_sg qrtr_smd uvcvideo videobuf2_vmalloc videobuf2_memops ath10k_snoc ath10k_core hci_uart btqca btbcm mac80211 bluetooth snd_soc_sdm845 ath snd_soc_rt5663 snd_soc_qcom_common snd_soc_rl6231 soundwire_bus ecdh_generic ecc qcom_spmi_adc5 venus_core qcom_pon qcom_spmi_temp_alarm qcom_vadc_common v4l2_mem2mem videobuf2_v4l2 cfg80211 videobuf2_common hid_multitouch reset_qcom_pdc qcrypto qcom_rng rfkill qcom_q6v5_mss libarc4 libdes qrtr ns qcom_wdt socinfo slim_qcom_ngd_ctrl [ 192.065739] pdr_interface qcom_q6v5_pas slimbus qcom_pil_info qcom_q6v5 qcom_sysmon qcom_common qcom_glink_smem qmi_helpers rmtfs_mem tcp_bbr sch_fq fuse ip_tables x_tables ipv6 crc_ccitt ti_sn65dsi86 i2c_hid msm mdt_loader llcc_qcom rtc_pm8xxx ocmem drm_kms_helper crct10dif_ce phy_qcom_qusb2 i2c_qcom_geni panel_simple drm pwm_bl [ 192.066066] CPU: 3 PID: 2039 Comm: gnome-shell Tainted: G W 5.10.0-rc7-next-20201208 #1 [ 192.066068] Hardware name: LENOVO 81JL/LNVNB161216, BIOS 9UCN33WW(V2.06) 06/ 4/2019 [ 192.066072] pstate: 40400005 (nZcv daif +PAN -UAO -TCO BTYPE=--) [ 192.066099] pc : put_iova_vmas+0x94/0xa0 [msm] [ 192.066262] lr : put_iova_vmas+0x1c/0xa0 [msm] [ 192.066403] sp : ffff800019efbbb0 [ 192.066405] x29: ffff800019efbbb0 x28: ffff800019efbd88 [ 192.066411] x27: 0000000000000000 x26: ffff109582efa400 [ 192.066417] x25: 0000000000000009 x24: 000000000000012b [ 192.066422] x23: ffff109582efa438 x22: ffff109582efa450 [ 192.066427] x21: ffff109582efa528 x20: ffff1095cbd4f200 [ 192.066432] x19: ffff1095cbd4f200 x18: 0000000000000000 [ 192.066438] x17: 0000000000000000 x16: ffffc26c200ca750 [ 192.066727] x15: 0000000000000000 x14: 0000000000000000 [ 192.066741] x13: ffff1096fb8c9100 x12: 0000000000000002 [ 192.066754] x11: ffffffffffffffff x10: 0000000000000002 [ 192.067046] x9 : 0000000000000001 x8 : 0000000000000a36 [ 192.067060] x7 : ffff4e2ad9f11000 x6 : ffffc26c216d4000 [ 192.067212] x5 : ffffc26c2022661c x4 : ffff1095c2b98000 [ 192.067367] x3 : ffff1095cbd4f300 x2 : 0000000000000000 [ 192.067380] x1 : ffff1095c2b98000 x0 : 0000000000000000 [ 192.067667] Call trace: [ 192.067734] put_iova_vmas+0x94/0xa0 [msm] [ 192.068078] msm_gem_free_object+0xb4/0x110 [msm] [ 192.068399] drm_gem_object_free+0x1c/0x30 [drm] [ 192.068717] drm_gem_object_handle_put_unlocked+0xf0/0xf8 [drm] [ 192.069032] drm_gem_object_release_handle+0x6c/0x88 [drm] [ 192.069349] drm_gem_handle_delete+0x68/0xc0 [drm] [ 192.069666] drm_gem_close_ioctl+0x30/0x48 [drm] [ 192.069984] drm_ioctl_kernel+0xc0/0x110 [drm] [ 192.070303] drm_ioctl+0x210/0x440 [drm] [ 192.070588] __arm64_sys_ioctl+0xa8/0xf0 [ 192.070599] el0_svc_common.constprop.0+0x74/0x190 [ 192.070608] do_el0_svc+0x24/0x90 [ 192.070618] el0_svc+0x14/0x20 [ 192.070903] el0_sync_handler+0xb0/0xb8 [ 192.070911] el0_sync+0x174/0x180 [ 192.070918] ---[ end trace bee6b12a899001a3 ]--- [ 192.072140] ------------[ cut here ]------------ Fixes: 9b73bde39cf2 ("drm/msm: Fix use-after-free in msm_gem with carveout") Signed-off-by: Rob Clark Acked-by: Iskren Chernev --- drivers/gpu/drm/msm/msm_gem.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index 68a6c7eacc0a..a21be5b910ff 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -990,6 +990,8 @@ void msm_gem_free_object(struct drm_gem_object *obj) if (msm_obj->pages) kvfree(msm_obj->pages); + put_iova_vmas(obj); + /* dma_buf_detach() grabs resv lock, so we need to unlock * prior to drm_prime_gem_destroy */ @@ -999,11 +1001,10 @@ void msm_gem_free_object(struct drm_gem_object *obj) } else { msm_gem_vunmap(obj); put_pages(obj); + put_iova_vmas(obj); msm_gem_unlock(obj); } - put_iova_vmas(obj); - drm_gem_object_release(obj); kfree(msm_obj); From 161b838e25c6f83495e27e3f546b893622d442bf Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 14 Dec 2020 23:40:15 +0000 Subject: [PATCH 041/547] netfilter: nftables: fix incorrect increment of loop counter The intention of the err_expr cleanup path is to iterate over the allocated expr_array objects and free them, starting from i - 1 and working down to the start of the array. Currently the loop counter is being incremented instead of decremented and also the index i is being used instead of k, repeatedly destroying the same expr_array element. Fix this by decrementing k and using k as the index into expr_array. Addresses-Coverity: ("Infinite loop") Fixes: 8cfd9b0f8515 ("netfilter: nftables: generalize set expressions support") Signed-off-by: Colin Ian King Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 8d5aa0ac45f4..4186b1e52d58 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5254,8 +5254,8 @@ static int nft_set_elem_expr_clone(const struct nft_ctx *ctx, return 0; err_expr: - for (k = i - 1; k >= 0; k++) - nft_expr_destroy(ctx, expr_array[i]); + for (k = i - 1; k >= 0; k--) + nft_expr_destroy(ctx, expr_array[k]); return -ENOMEM; } From df9716ec9ade3d2e190a2aac199557d30a3a8416 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 15 Dec 2020 13:20:24 +0000 Subject: [PATCH 042/547] regulator: pf8x00: Use specific compatible strings for devices The pf8x00 driver supports three devices, the DT compatible strings and I2C IDs should enumerate these specifically rather than using a wildcard so that we don't collide with anything incompatible in the same ID range in the future and so that we can handle any software visible differences between the variants we find. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20201215132024.13356-1-broonie@kernel.org Signed-off-by: Mark Brown --- .../bindings/regulator/nxp,pf8x00-regulator.yaml | 6 ++++-- drivers/regulator/pf8x00-regulator.c | 8 ++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/Documentation/devicetree/bindings/regulator/nxp,pf8x00-regulator.yaml b/Documentation/devicetree/bindings/regulator/nxp,pf8x00-regulator.yaml index a6c259ce9785..956156fe52a3 100644 --- a/Documentation/devicetree/bindings/regulator/nxp,pf8x00-regulator.yaml +++ b/Documentation/devicetree/bindings/regulator/nxp,pf8x00-regulator.yaml @@ -19,7 +19,9 @@ description: | properties: compatible: enum: - - nxp,pf8x00 + - nxp,pf8100 + - nxp,pf8121a + - nxp,pf8200 reg: maxItems: 1 @@ -118,7 +120,7 @@ examples: #size-cells = <0>; pmic@8 { - compatible = "nxp,pf8x00"; + compatible = "nxp,pf8100"; reg = <0x08>; regulators { diff --git a/drivers/regulator/pf8x00-regulator.c b/drivers/regulator/pf8x00-regulator.c index 308c27fa6ea8..af9918cd27aa 100644 --- a/drivers/regulator/pf8x00-regulator.c +++ b/drivers/regulator/pf8x00-regulator.c @@ -469,13 +469,17 @@ static int pf8x00_i2c_probe(struct i2c_client *client) } static const struct of_device_id pf8x00_dt_ids[] = { - { .compatible = "nxp,pf8x00",}, + { .compatible = "nxp,pf8100",}, + { .compatible = "nxp,pf8121a",}, + { .compatible = "nxp,pf8200",}, { } }; MODULE_DEVICE_TABLE(of, pf8x00_dt_ids); static const struct i2c_device_id pf8x00_i2c_id[] = { - { "pf8x00", 0 }, + { "pf8100", 0 }, + { "pf8121a", 0 }, + { "pf8200", 0 }, {}, }; MODULE_DEVICE_TABLE(i2c, pf8x00_i2c_id); From 2d18e54dd8662442ef5898c6bdadeaf90b3cebbc Mon Sep 17 00:00:00 2001 From: Qinglang Miao Date: Thu, 10 Dec 2020 09:29:43 +0800 Subject: [PATCH 043/547] cgroup: Fix memory leak when parsing multiple source parameters A memory leak is found in cgroup1_parse_param() when multiple source parameters overwrite fc->source in the fs_context struct without free. unreferenced object 0xffff888100d930e0 (size 16): comm "mount", pid 520, jiffies 4303326831 (age 152.783s) hex dump (first 16 bytes): 74 65 73 74 6c 65 61 6b 00 00 00 00 00 00 00 00 testleak........ backtrace: [<000000003e5023ec>] kmemdup_nul+0x2d/0xa0 [<00000000377dbdaa>] vfs_parse_fs_string+0xc0/0x150 [<00000000cb2b4882>] generic_parse_monolithic+0x15a/0x1d0 [<000000000f750198>] path_mount+0xee1/0x1820 [<0000000004756de2>] do_mount+0xea/0x100 [<0000000094cafb0a>] __x64_sys_mount+0x14b/0x1f0 Fix this bug by permitting a single source parameter and rejecting with an error all subsequent ones. Fixes: 8d2451f4994f ("cgroup1: switch to option-by-option parsing") Reported-by: Hulk Robot Signed-off-by: Qinglang Miao Reviewed-by: Zefan Li Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 191c329e482a..32596fdbcd5b 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -908,6 +908,8 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) opt = fs_parse(fc, cgroup1_fs_parameters, param, &result); if (opt == -ENOPARAM) { if (strcmp(param->key, "source") == 0) { + if (fc->source) + return invalf(fc, "Multiple sources not supported"); fc->source = param->string; param->string = NULL; return 0; From 1a3449c19407a28f7019a887cdf0d6ba2444751a Mon Sep 17 00:00:00 2001 From: Kamal Mostafa Date: Tue, 15 Dec 2020 10:20:10 -0800 Subject: [PATCH 044/547] selftests/bpf: Clarify build error if no vmlinux If Makefile cannot find any of the vmlinux's in its VMLINUX_BTF_PATHS list, it tries to run btftool incorrectly, with VMLINUX_BTF unset: bpftool btf dump file $(VMLINUX_BTF) format c Such that the keyword 'format' is misinterpreted as the path to vmlinux. The resulting build error message is fairly cryptic: GEN vmlinux.h Error: failed to load BTF from format: No such file or directory This patch makes the failure reason clearer by yielding this instead: Makefile:...: *** Cannot find a vmlinux for VMLINUX_BTF at any of "{paths}". Stop. Fixes: acbd06206bbb ("selftests/bpf: Add vmlinux.h selftest exercising tracing of syscalls") Signed-off-by: Kamal Mostafa Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201215182011.15755-1-kamal@canonical.com --- tools/testing/selftests/bpf/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 8c33e999319a..c51df6b91bef 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -121,6 +121,9 @@ VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \ /sys/kernel/btf/vmlinux \ /boot/vmlinux-$(shell uname -r) VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) +ifeq ($(VMLINUX_BTF),) +$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)") +endif # Define simple and short `make test_progs`, `make test_sysctl`, etc targets # to build individual tests. From 81e7eb5bf08f36d34495a5898f6ef3fec05d9776 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 16 Dec 2020 22:43:44 -0500 Subject: [PATCH 045/547] Revert "Revert "scsi: megaraid_sas: Added support for shared host tagset for cpuhotplug"" This reverts commit 1a0e1943d8798cb3241fb5edb9a836af1611b60a. Commit b3c6a5997541 ("block: Fix a lockdep complaint triggered by request queue flushing") has been reverted and commit fb01a2932e81 has been introduced in its place. Consequently, it is now safe to reinstate the megaraid_sas tagset changes that led to boot problems in 5.10. Signed-off-by: Martin K. Petersen --- drivers/scsi/megaraid/megaraid_sas_base.c | 39 +++++++++++++++++++++ drivers/scsi/megaraid/megaraid_sas_fusion.c | 29 ++++++++------- 2 files changed, 55 insertions(+), 13 deletions(-) diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index e158d3d62056..41cd66fc7d81 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -113,6 +114,10 @@ unsigned int enable_sdev_max_qd; module_param(enable_sdev_max_qd, int, 0444); MODULE_PARM_DESC(enable_sdev_max_qd, "Enable sdev max qd as can_queue. Default: 0"); +int host_tagset_enable = 1; +module_param(host_tagset_enable, int, 0444); +MODULE_PARM_DESC(host_tagset_enable, "Shared host tagset enable/disable Default: enable(1)"); + MODULE_LICENSE("GPL"); MODULE_VERSION(MEGASAS_VERSION); MODULE_AUTHOR("megaraidlinux.pdl@broadcom.com"); @@ -3119,6 +3124,19 @@ megasas_bios_param(struct scsi_device *sdev, struct block_device *bdev, return 0; } +static int megasas_map_queues(struct Scsi_Host *shost) +{ + struct megasas_instance *instance; + + instance = (struct megasas_instance *)shost->hostdata; + + if (shost->nr_hw_queues == 1) + return 0; + + return blk_mq_pci_map_queues(&shost->tag_set.map[HCTX_TYPE_DEFAULT], + instance->pdev, instance->low_latency_index_start); +} + static void megasas_aen_polling(struct work_struct *work); /** @@ -3427,6 +3445,7 @@ static struct scsi_host_template megasas_template = { .eh_timed_out = megasas_reset_timer, .shost_attrs = megaraid_host_attrs, .bios_param = megasas_bios_param, + .map_queues = megasas_map_queues, .change_queue_depth = scsi_change_queue_depth, .max_segment_size = 0xffffffff, }; @@ -6808,6 +6827,26 @@ static int megasas_io_attach(struct megasas_instance *instance) host->max_lun = MEGASAS_MAX_LUN; host->max_cmd_len = 16; + /* Use shared host tagset only for fusion adaptors + * if there are managed interrupts (smp affinity enabled case). + * Single msix_vectors in kdump, so shared host tag is also disabled. + */ + + host->host_tagset = 0; + host->nr_hw_queues = 1; + + if ((instance->adapter_type != MFI_SERIES) && + (instance->msix_vectors > instance->low_latency_index_start) && + host_tagset_enable && + instance->smp_affinity_enable) { + host->host_tagset = 1; + host->nr_hw_queues = instance->msix_vectors - + instance->low_latency_index_start; + } + + dev_info(&instance->pdev->dev, + "Max firmware commands: %d shared with nr_hw_queues = %d\n", + instance->max_fw_cmds, host->nr_hw_queues); /* * Notify the mid-layer about the new controller */ diff --git a/drivers/scsi/megaraid/megaraid_sas_fusion.c b/drivers/scsi/megaraid/megaraid_sas_fusion.c index b0c01cf0428f..fd607287608e 100644 --- a/drivers/scsi/megaraid/megaraid_sas_fusion.c +++ b/drivers/scsi/megaraid/megaraid_sas_fusion.c @@ -359,24 +359,29 @@ megasas_get_msix_index(struct megasas_instance *instance, { int sdev_busy; - /* nr_hw_queue = 1 for MegaRAID */ - struct blk_mq_hw_ctx *hctx = - scmd->device->request_queue->queue_hw_ctx[0]; - - sdev_busy = atomic_read(&hctx->nr_active); + /* TBD - if sml remove device_busy in future, driver + * should track counter in internal structure. + */ + sdev_busy = atomic_read(&scmd->device->device_busy); if (instance->perf_mode == MR_BALANCED_PERF_MODE && - sdev_busy > (data_arms * MR_DEVICE_HIGH_IOPS_DEPTH)) + sdev_busy > (data_arms * MR_DEVICE_HIGH_IOPS_DEPTH)) { cmd->request_desc->SCSIIO.MSIxIndex = mega_mod64((atomic64_add_return(1, &instance->high_iops_outstanding) / MR_HIGH_IOPS_BATCH_COUNT), instance->low_latency_index_start); - else if (instance->msix_load_balance) + } else if (instance->msix_load_balance) { cmd->request_desc->SCSIIO.MSIxIndex = (mega_mod64(atomic64_add_return(1, &instance->total_io_count), instance->msix_vectors)); - else + } else if (instance->host->nr_hw_queues > 1) { + u32 tag = blk_mq_unique_tag(scmd->request); + + cmd->request_desc->SCSIIO.MSIxIndex = blk_mq_unique_tag_to_hwq(tag) + + instance->low_latency_index_start; + } else { cmd->request_desc->SCSIIO.MSIxIndex = instance->reply_map[raw_smp_processor_id()]; + } } /** @@ -956,9 +961,6 @@ megasas_alloc_cmds_fusion(struct megasas_instance *instance) if (megasas_alloc_cmdlist_fusion(instance)) goto fail_exit; - dev_info(&instance->pdev->dev, "Configured max firmware commands: %d\n", - instance->max_fw_cmds); - /* The first 256 bytes (SMID 0) is not used. Don't add to the cmd list */ io_req_base = fusion->io_request_frames + MEGA_MPI2_RAID_DEFAULT_IO_FRAME_SIZE; io_req_base_phys = fusion->io_request_frames_phys + MEGA_MPI2_RAID_DEFAULT_IO_FRAME_SIZE; @@ -1102,8 +1104,9 @@ megasas_ioc_init_fusion(struct megasas_instance *instance) MR_HIGH_IOPS_QUEUE_COUNT) && cur_intr_coalescing) instance->perf_mode = MR_BALANCED_PERF_MODE; - dev_info(&instance->pdev->dev, "Performance mode :%s\n", - MEGASAS_PERF_MODE_2STR(instance->perf_mode)); + dev_info(&instance->pdev->dev, "Performance mode :%s (latency index = %d)\n", + MEGASAS_PERF_MODE_2STR(instance->perf_mode), + instance->low_latency_index_start); instance->fw_sync_cache_support = (scratch_pad_1 & MR_CAN_HANDLE_SYNC_CACHE_OFFSET) ? 1 : 0; From 292bff9480c8d52fc58028979c4162abd83f1aec Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 14 Dec 2020 23:24:17 +0000 Subject: [PATCH 046/547] ath11k: add missing null check on allocated skb Currently the null check on a newly allocated skb is missing and this can lead to a null pointer dereference is the allocation fails. Fix this by adding a null check and returning -ENOMEM. Addresses-Coverity: ("Dereference null return") Fixes: 43ed15e1ee01 ("ath11k: put hw to DBS using WMI_PDEV_SET_HW_MODE_CMDID") Signed-off-by: Colin Ian King Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20201214232417.84556-1-colin.king@canonical.com --- drivers/net/wireless/ath/ath11k/wmi.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/wireless/ath/ath11k/wmi.c b/drivers/net/wireless/ath/ath11k/wmi.c index da4b546b62cb..73869d445c5b 100644 --- a/drivers/net/wireless/ath/ath11k/wmi.c +++ b/drivers/net/wireless/ath/ath11k/wmi.c @@ -3460,6 +3460,9 @@ int ath11k_wmi_set_hw_mode(struct ath11k_base *ab, len = sizeof(*cmd); skb = ath11k_wmi_alloc_skb(wmi_ab, len); + if (!skb) + return -ENOMEM; + cmd = (struct wmi_pdev_set_hw_mode_cmd_param *)skb->data; cmd->tlv_header = FIELD_PREP(WMI_TLV_TAG, WMI_TAG_PDEV_SET_HW_MODE_CMD) | From 3597010630d0aa96f5778901e691c6068bb86318 Mon Sep 17 00:00:00 2001 From: Carl Huang Date: Fri, 11 Dec 2020 00:56:13 -0500 Subject: [PATCH 047/547] ath11k: fix crash caused by NULL rx_channel During connect and disconnect stress test, crashed happened because ar->rx_channel is NULL. Fix it by checking whether ar->rx_channel is NULL. Crash stack is as below: RIP: 0010:ath11k_dp_rx_h_ppdu+0x110/0x230 [ath11k] [ 5028.808963] ath11k_dp_rx_wbm_err+0x14a/0x360 [ath11k] [ 5028.808970] ath11k_dp_rx_process_wbm_err+0x41c/0x520 [ath11k] [ 5028.808978] ath11k_dp_service_srng+0x25e/0x2d0 [ath11k] [ 5028.808982] ath11k_pci_ext_grp_napi_poll+0x23/0x80 [ath11k_pci] [ 5028.808986] net_rx_action+0x27e/0x400 [ 5028.808990] __do_softirq+0xfd/0x2bb [ 5028.808993] irq_exit+0xa6/0xb0 [ 5028.808995] do_IRQ+0x56/0xe0 [ 5028.808997] common_interrupt+0xf/0xf Tested-on: QCA6390 hw2.0 PCI WLAN.HST.1.0.1-01740-QCAHSTSWPLZ_V2_TO_X86-1 Signed-off-by: Carl Huang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20201211055613.9310-1-cjhuang@codeaurora.org --- drivers/net/wireless/ath/ath11k/dp_rx.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/ath/ath11k/dp_rx.c b/drivers/net/wireless/ath/ath11k/dp_rx.c index 205c0f1a40e9..920e5026a635 100644 --- a/drivers/net/wireless/ath/ath11k/dp_rx.c +++ b/drivers/net/wireless/ath/ath11k/dp_rx.c @@ -2294,6 +2294,7 @@ static void ath11k_dp_rx_h_ppdu(struct ath11k *ar, struct hal_rx_desc *rx_desc, { u8 channel_num; u32 center_freq; + struct ieee80211_channel *channel; rx_status->freq = 0; rx_status->rate_idx = 0; @@ -2314,9 +2315,12 @@ static void ath11k_dp_rx_h_ppdu(struct ath11k *ar, struct hal_rx_desc *rx_desc, rx_status->band = NL80211_BAND_5GHZ; } else { spin_lock_bh(&ar->data_lock); - rx_status->band = ar->rx_channel->band; - channel_num = - ieee80211_frequency_to_channel(ar->rx_channel->center_freq); + channel = ar->rx_channel; + if (channel) { + rx_status->band = channel->band; + channel_num = + ieee80211_frequency_to_channel(channel->center_freq); + } spin_unlock_bh(&ar->data_lock); ath11k_dbg_dump(ar->ab, ATH11K_DBG_DATA, NULL, "rx_desc: ", rx_desc, sizeof(struct hal_rx_desc)); From aa44b2f3ecd41f90b7e477158036648a49d21a32 Mon Sep 17 00:00:00 2001 From: Carl Huang Date: Fri, 11 Dec 2020 00:13:58 -0500 Subject: [PATCH 048/547] ath11k: start vdev if a bss peer is already created For QCA6390, bss peer must be created before vdev is to start. This change is to start vdev if a bss peer is created. Otherwise, ath11k delays to start vdev. This fixes an issue in a case where HT/VHT/HE settings change between authentication and association, e.g., due to the user space request to disable HT. Tested-on: QCA6390 hw2.0 PCI WLAN.HST.1.0.1-01740-QCAHSTSWPLZ_V2_TO_X86-1 Signed-off-by: Carl Huang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20201211051358.9191-1-cjhuang@codeaurora.org --- drivers/net/wireless/ath/ath11k/mac.c | 8 ++++++-- drivers/net/wireless/ath/ath11k/peer.c | 17 +++++++++++++++++ drivers/net/wireless/ath/ath11k/peer.h | 2 ++ 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c index 5c175e3e09b2..c1608f64ea95 100644 --- a/drivers/net/wireless/ath/ath11k/mac.c +++ b/drivers/net/wireless/ath/ath11k/mac.c @@ -3021,6 +3021,7 @@ static int ath11k_mac_station_add(struct ath11k *ar, } if (ab->hw_params.vdev_start_delay && + !arvif->is_started && arvif->vdev_type != WMI_VDEV_TYPE_AP) { ret = ath11k_start_vdev_delay(ar->hw, vif); if (ret) { @@ -5284,7 +5285,8 @@ ath11k_mac_op_assign_vif_chanctx(struct ieee80211_hw *hw, /* for QCA6390 bss peer must be created before vdev_start */ if (ab->hw_params.vdev_start_delay && arvif->vdev_type != WMI_VDEV_TYPE_AP && - arvif->vdev_type != WMI_VDEV_TYPE_MONITOR) { + arvif->vdev_type != WMI_VDEV_TYPE_MONITOR && + !ath11k_peer_find_by_vdev_id(ab, arvif->vdev_id)) { memcpy(&arvif->chanctx, ctx, sizeof(*ctx)); ret = 0; goto out; @@ -5295,7 +5297,9 @@ ath11k_mac_op_assign_vif_chanctx(struct ieee80211_hw *hw, goto out; } - if (ab->hw_params.vdev_start_delay) { + if (ab->hw_params.vdev_start_delay && + (arvif->vdev_type == WMI_VDEV_TYPE_AP || + arvif->vdev_type == WMI_VDEV_TYPE_MONITOR)) { param.vdev_id = arvif->vdev_id; param.peer_type = WMI_PEER_TYPE_DEFAULT; param.peer_addr = ar->mac_addr; diff --git a/drivers/net/wireless/ath/ath11k/peer.c b/drivers/net/wireless/ath/ath11k/peer.c index 1866d82678fa..b69e7ebfa930 100644 --- a/drivers/net/wireless/ath/ath11k/peer.c +++ b/drivers/net/wireless/ath/ath11k/peer.c @@ -76,6 +76,23 @@ struct ath11k_peer *ath11k_peer_find_by_id(struct ath11k_base *ab, return NULL; } +struct ath11k_peer *ath11k_peer_find_by_vdev_id(struct ath11k_base *ab, + int vdev_id) +{ + struct ath11k_peer *peer; + + spin_lock_bh(&ab->base_lock); + + list_for_each_entry(peer, &ab->peers, list) { + if (vdev_id == peer->vdev_id) { + spin_unlock_bh(&ab->base_lock); + return peer; + } + } + spin_unlock_bh(&ab->base_lock); + return NULL; +} + void ath11k_peer_unmap_event(struct ath11k_base *ab, u16 peer_id) { struct ath11k_peer *peer; diff --git a/drivers/net/wireless/ath/ath11k/peer.h b/drivers/net/wireless/ath/ath11k/peer.h index bba2e00b6944..8553ed061aea 100644 --- a/drivers/net/wireless/ath/ath11k/peer.h +++ b/drivers/net/wireless/ath/ath11k/peer.h @@ -43,5 +43,7 @@ int ath11k_peer_create(struct ath11k *ar, struct ath11k_vif *arvif, struct ieee80211_sta *sta, struct peer_create_params *param); int ath11k_wait_for_peer_delete_done(struct ath11k *ar, u32 vdev_id, const u8 *addr); +struct ath11k_peer *ath11k_peer_find_by_vdev_id(struct ath11k_base *ab, + int vdev_id); #endif /* _PEER_H_ */ From 9b09456258ea2f35fc8a99c4ac4829dcba0ca4be Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 16 Dec 2020 11:31:19 +0300 Subject: [PATCH 049/547] ath11k: Fix error code in ath11k_core_suspend() The "if (!ret)" condition is inverted and it should be "if (ret)". It means that we return success when we had intended to return an error code. This also caused a spurious warning even when the suspend was successful: [ 297.186612] ath11k_pci 0000:06:00.0: failed to suspend hif: 0 Fixes: d1b0c33850d2 ("ath11k: implement suspend for QCA6390 PCI devices") Signed-off-by: Dan Carpenter Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/X9nF17L2/EKOSbn/@mwanda --- drivers/net/wireless/ath/ath11k/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath11k/core.c b/drivers/net/wireless/ath/ath11k/core.c index b97c38b9a270..350b7913622c 100644 --- a/drivers/net/wireless/ath/ath11k/core.c +++ b/drivers/net/wireless/ath/ath11k/core.c @@ -185,7 +185,7 @@ int ath11k_core_suspend(struct ath11k_base *ab) ath11k_hif_ce_irq_disable(ab); ret = ath11k_hif_suspend(ab); - if (!ret) { + if (ret) { ath11k_warn(ab, "failed to suspend hif: %d\n", ret); return ret; } From 30d085039314fcad2c2e33a2dfc8e79765ddf408 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 16 Dec 2020 11:32:12 +0300 Subject: [PATCH 050/547] ath11k: Fix ath11k_pci_fix_l1ss() All these conditions are reversed so presumably most of the function is dead code. This caused a spurious warning: [ 95.734922] ath11k_pci 0000:06:00.0: failed to set sysclk: 0 Fixes: 0699940755e9 ("ath11k: pci: fix L1ss clock unstable problem") Signed-off-by: Dan Carpenter Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/X9nGDHiTh+Z+asDy@mwanda --- drivers/net/wireless/ath/ath11k/pci.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/ath/ath11k/pci.c b/drivers/net/wireless/ath/ath11k/pci.c index 857647aa57c8..9f9a824a4c2d 100644 --- a/drivers/net/wireless/ath/ath11k/pci.c +++ b/drivers/net/wireless/ath/ath11k/pci.c @@ -274,7 +274,7 @@ static int ath11k_pci_fix_l1ss(struct ath11k_base *ab) PCIE_QSERDES_COM_SYSCLK_EN_SEL_REG, PCIE_QSERDES_COM_SYSCLK_EN_SEL_VAL, PCIE_QSERDES_COM_SYSCLK_EN_SEL_MSK); - if (!ret) { + if (ret) { ath11k_warn(ab, "failed to set sysclk: %d\n", ret); return ret; } @@ -283,7 +283,7 @@ static int ath11k_pci_fix_l1ss(struct ath11k_base *ab) PCIE_USB3_PCS_MISC_OSC_DTCT_CONFIG1_REG, PCIE_USB3_PCS_MISC_OSC_DTCT_CONFIG1_VAL, PCIE_USB3_PCS_MISC_OSC_DTCT_CONFIG_MSK); - if (!ret) { + if (ret) { ath11k_warn(ab, "failed to set dtct config1 error: %d\n", ret); return ret; } @@ -292,7 +292,7 @@ static int ath11k_pci_fix_l1ss(struct ath11k_base *ab) PCIE_USB3_PCS_MISC_OSC_DTCT_CONFIG2_REG, PCIE_USB3_PCS_MISC_OSC_DTCT_CONFIG2_VAL, PCIE_USB3_PCS_MISC_OSC_DTCT_CONFIG_MSK); - if (!ret) { + if (ret) { ath11k_warn(ab, "failed to set dtct config2: %d\n", ret); return ret; } @@ -301,7 +301,7 @@ static int ath11k_pci_fix_l1ss(struct ath11k_base *ab) PCIE_USB3_PCS_MISC_OSC_DTCT_CONFIG4_REG, PCIE_USB3_PCS_MISC_OSC_DTCT_CONFIG4_VAL, PCIE_USB3_PCS_MISC_OSC_DTCT_CONFIG_MSK); - if (!ret) { + if (ret) { ath11k_warn(ab, "failed to set dtct config4: %d\n", ret); return ret; } From e7f6f893ac39c8715d959ff8d677645ef5e0f8b4 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 8 Dec 2020 10:20:15 +0100 Subject: [PATCH 051/547] mt76: mt76u: fix NULL pointer dereference in mt76u_status_worker Fix the following NULL pointer dereference in mt76u_status_worker that can occur if status thread runs before allocating tx queues [ 31.395373] BUG: kernel NULL pointer dereference, address: 000000000000002c [ 31.395769] #PF: supervisor read access in kernel mode [ 31.395985] #PF: error_code(0x0000) - not-present page [ 31.396178] PGD 0 P4D 0 [ 31.396277] Oops: 0000 [#1] SMP [ 31.396430] CPU: 3 PID: 337 Comm: mt76-usb-status Not tainted 5.10.0-rc1-kvm+ #49 [ 31.396703] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-3.fc33 04/01/2014 [ 31.397048] RIP: 0010:mt76u_status_worker+0x2b/0x190 [ 31.397931] RSP: 0018:ffffc9000076fe98 EFLAGS: 00010282 [ 31.398118] RAX: 0000000000000001 RBX: ffff888111203fe8 RCX: 0000000000000000 [ 31.398400] RDX: 0000000000000001 RSI: 0000000000000246 RDI: ffff888111203fe8 [ 31.398668] RBP: ffff888111201d00 R08: 000000000000038c R09: 000000000000009b [ 31.398952] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 [ 31.399235] R13: 0000000000000000 R14: 0000000000000000 R15: ffff88810c987300 [ 31.399494] FS: 0000000000000000(0000) GS:ffff88817bd80000(0000) knlGS:0000000000000000 [ 31.399767] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 31.399991] CR2: 000000000000002c CR3: 0000000103525000 CR4: 00000000000006a0 [ 31.400236] Call Trace: [ 31.400348] ? schedule+0x3e/0xa0 [ 31.400514] __mt76_worker_fn+0x71/0xa0 [ 31.400634] ? mt76_get_min_avg_rssi+0x110/0x110 [ 31.400827] kthread+0x118/0x130 [ 31.400984] ? __kthread_bind_mask+0x60/0x60 [ 31.401212] ret_from_fork+0x1f/0x30 [ 31.401353] Modules linked in: [ 31.401480] CR2: 000000000000002c [ 31.401627] ---[ end trace 8bf174505cc34851 ]--- [ 31.401798] RIP: 0010:mt76u_status_worker+0x2b/0x190 [ 31.402636] RSP: 0018:ffffc9000076fe98 EFLAGS: 00010282 [ 31.402829] RAX: 0000000000000001 RBX: ffff888111203fe8 RCX: 0000000000000000 [ 31.403118] RDX: 0000000000000001 RSI: 0000000000000246 RDI: ffff888111203fe8 [ 31.403424] RBP: ffff888111201d00 R08: 000000000000038c R09: 000000000000009b [ 31.403689] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 [ 31.403933] R13: 0000000000000000 R14: 0000000000000000 R15: ffff88810c987300 [ 31.404209] FS: 0000000000000000(0000) GS:ffff88817bd80000(0000) knlGS:0000000000000000 [ 31.404482] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 31.404726] CR2: 000000000000002c CR3: 0000000103525000 CR4: 00000000000006a0 [ 31.405294] mt76x0u: probe of 1-1:1.0 failed with error -110 [ 31.406007] usb 1-1: USB disconnect, device number 2 [ 31.840756] usb 1-1: new high-speed USB device number 3 using xhci_hcd [ 32.461295] usb 1-1: reset high-speed USB device number 3 using xhci_hcd [ 32.659932] mt76x0u 1-1:1.0: ASIC revision: 76100002 MAC revision: 76502000 [ 33.197032] mt76x0u 1-1:1.0: EEPROM ver:02 fae:01 Fixes: 9daf27e62852 ("mt76: mt76u: use dedicated thread for status work") Signed-off-by: Lorenzo Bianconi Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/cd44dc407cf3e5f27688105d4a75fb1c68e62b06.1607419147.git.lorenzo@kernel.org --- drivers/net/wireless/mediatek/mt76/usb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/wireless/mediatek/mt76/usb.c b/drivers/net/wireless/mediatek/mt76/usb.c index dc850109de22..8b08cad131c8 100644 --- a/drivers/net/wireless/mediatek/mt76/usb.c +++ b/drivers/net/wireless/mediatek/mt76/usb.c @@ -816,6 +816,8 @@ static void mt76u_status_worker(struct mt76_worker *w) for (i = 0; i < IEEE80211_NUM_ACS; i++) { q = dev->phy.q_tx[i]; + if (!q) + continue; while (q->queued > 0) { if (!q->entry[q->tail].done) From 4dfde294b9792dcf8615b55c58f093d544f472f0 Mon Sep 17 00:00:00 2001 From: Ping-Ke Shih Date: Mon, 14 Dec 2020 13:31:06 +0800 Subject: [PATCH 052/547] rtlwifi: rise completion at the last step of firmware callback request_firmware_nowait() which schedules another work is used to load firmware when USB is probing. If USB is unplugged before running the firmware work, it goes disconnect ops, and then causes use-after-free. Though we wait for completion of firmware work before freeing the hw, firmware callback rises completion too early. So I move it to the last step. usb 5-1: Direct firmware load for rtlwifi/rtl8192cufw.bin failed with error -2 rtlwifi: Loading alternative firmware rtlwifi/rtl8192cufw.bin rtlwifi: Selected firmware is not available ================================================================== BUG: KASAN: use-after-free in rtl_fw_do_work.cold+0x68/0x6a drivers/net/wireless/realtek/rtlwifi/core.c:93 Write of size 4 at addr ffff8881454cff50 by task kworker/0:6/7379 CPU: 0 PID: 7379 Comm: kworker/0:6 Not tainted 5.10.0-rc7-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: events request_firmware_work_func Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x107/0x163 lib/dump_stack.c:118 print_address_description.constprop.0.cold+0xae/0x4c8 mm/kasan/report.c:385 __kasan_report mm/kasan/report.c:545 [inline] kasan_report.cold+0x1f/0x37 mm/kasan/report.c:562 rtl_fw_do_work.cold+0x68/0x6a drivers/net/wireless/realtek/rtlwifi/core.c:93 request_firmware_work_func+0x12c/0x230 drivers/base/firmware_loader/main.c:1079 process_one_work+0x933/0x1520 kernel/workqueue.c:2272 worker_thread+0x64c/0x1120 kernel/workqueue.c:2418 kthread+0x38c/0x460 kernel/kthread.c:292 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:296 The buggy address belongs to the page: page:00000000f54435b3 refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x1454cf flags: 0x200000000000000() raw: 0200000000000000 0000000000000000 ffffea00051533c8 0000000000000000 raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8881454cfe00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff8881454cfe80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff >ffff8881454cff00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ^ ffff8881454cff80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff8881454d0000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff Reported-by: syzbot+65be4277f3c489293939@syzkaller.appspotmail.com Signed-off-by: Ping-Ke Shih Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20201214053106.7748-1-pkshih@realtek.com --- drivers/net/wireless/realtek/rtlwifi/core.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/realtek/rtlwifi/core.c b/drivers/net/wireless/realtek/rtlwifi/core.c index a7259dbc953d..965bd9589045 100644 --- a/drivers/net/wireless/realtek/rtlwifi/core.c +++ b/drivers/net/wireless/realtek/rtlwifi/core.c @@ -78,7 +78,6 @@ static void rtl_fw_do_work(const struct firmware *firmware, void *context, rtl_dbg(rtlpriv, COMP_ERR, DBG_LOUD, "Firmware callback routine entered!\n"); - complete(&rtlpriv->firmware_loading_complete); if (!firmware) { if (rtlpriv->cfg->alt_fw_name) { err = request_firmware(&firmware, @@ -91,13 +90,13 @@ static void rtl_fw_do_work(const struct firmware *firmware, void *context, } pr_err("Selected firmware is not available\n"); rtlpriv->max_fw_size = 0; - return; + goto exit; } found_alt: if (firmware->size > rtlpriv->max_fw_size) { pr_err("Firmware is too big!\n"); release_firmware(firmware); - return; + goto exit; } if (!is_wow) { memcpy(rtlpriv->rtlhal.pfirmware, firmware->data, @@ -109,6 +108,9 @@ found_alt: rtlpriv->rtlhal.wowlan_fwsize = firmware->size; } release_firmware(firmware); + +exit: + complete(&rtlpriv->firmware_loading_complete); } void rtl_fw_cb(const struct firmware *firmware, void *context) From 443d6e86f821a165fae3fc3fc13086d27ac140b1 Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Wed, 16 Dec 2020 21:38:02 -0700 Subject: [PATCH 053/547] netfilter: x_tables: Update remaining dereference to RCU This fixes the dereference to fetch the RCU pointer when holding the appropriate xtables lock. Reported-by: kernel test robot Fixes: cc00bcaa5899 ("netfilter: x_tables: Switch synchronization to RCU") Signed-off-by: Subash Abhinov Kasiviswanathan Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arp_tables.c | 2 +- net/ipv4/netfilter/ip_tables.c | 2 +- net/ipv6/netfilter/ip6_tables.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 563b62b76a5f..c576a63d09db 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1379,7 +1379,7 @@ static int compat_get_entries(struct net *net, xt_compat_lock(NFPROTO_ARP); t = xt_find_table_lock(net, NFPROTO_ARP, get.name); if (!IS_ERR(t)) { - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); struct xt_table_info info; ret = compat_table_info(private, &info); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 6e2851f8d3a3..e8f6f9d86237 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1589,7 +1589,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr, xt_compat_lock(AF_INET); t = xt_find_table_lock(net, AF_INET, get.name); if (!IS_ERR(t)) { - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); struct xt_table_info info; ret = compat_table_info(private, &info); if (!ret && get.size == info.size) diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index c4f532f4d311..0d453fa9e327 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1598,7 +1598,7 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr, xt_compat_lock(AF_INET6); t = xt_find_table_lock(net, AF_INET6, get.name); if (!IS_ERR(t)) { - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); struct xt_table_info info; ret = compat_table_info(private, &info); if (!ret && get.size == info.size) From 2b33d6ffa9e38f344418976b06057e2fc2aa9e2a Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 17 Dec 2020 11:53:40 +0300 Subject: [PATCH 054/547] netfilter: ipset: fixes possible oops in mtype_resize currently mtype_resize() can cause oops t = ip_set_alloc(htable_size(htable_bits)); if (!t) { ret = -ENOMEM; goto out; } t->hregion = ip_set_alloc(ahash_sizeof_regions(htable_bits)); Increased htable_bits can force htable_size() to return 0. In own turn ip_set_alloc(0) returns not 0 but ZERO_SIZE_PTR, so follwoing access to t->hregion should trigger an OOPS. Signed-off-by: Vasily Averin Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_hash_gen.h | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 5f1208ad049e..1e7dddf824ae 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -640,7 +640,7 @@ mtype_resize(struct ip_set *set, bool retried) struct htype *h = set->data; struct htable *t, *orig; u8 htable_bits; - size_t dsize = set->dsize; + size_t hsize, dsize = set->dsize; #ifdef IP_SET_HASH_WITH_NETS u8 flags; struct mtype_elem *tmp; @@ -664,14 +664,12 @@ mtype_resize(struct ip_set *set, bool retried) retry: ret = 0; htable_bits++; - if (!htable_bits) { - /* In case we have plenty of memory :-) */ - pr_warn("Cannot increase the hashsize of set %s further\n", - set->name); - ret = -IPSET_ERR_HASH_FULL; - goto out; - } - t = ip_set_alloc(htable_size(htable_bits)); + if (!htable_bits) + goto hbwarn; + hsize = htable_size(htable_bits); + if (!hsize) + goto hbwarn; + t = ip_set_alloc(hsize); if (!t) { ret = -ENOMEM; goto out; @@ -813,6 +811,12 @@ cleanup: if (ret == -EAGAIN) goto retry; goto out; + +hbwarn: + /* In case we have plenty of memory :-) */ + pr_warn("Cannot increase the hashsize of set %s further\n", set->name); + ret = -IPSET_ERR_HASH_FULL; + goto out; } /* Get the current number of elements and ext_size in the set */ From 5c8193f568ae16f3242abad6518dc2ca6c8eef86 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 17 Dec 2020 17:53:18 +0300 Subject: [PATCH 055/547] netfilter: ipset: fix shift-out-of-bounds in htable_bits() htable_bits() can call jhash_size(32) and trigger shift-out-of-bounds UBSAN: shift-out-of-bounds in net/netfilter/ipset/ip_set_hash_gen.h:151:6 shift exponent 32 is too large for 32-bit type 'unsigned int' CPU: 0 PID: 8498 Comm: syz-executor519 Not tainted 5.10.0-rc7-next-20201208-syzkaller #0 Call Trace: __dump_stack lib/dump_stack.c:79 [inline] dump_stack+0x107/0x163 lib/dump_stack.c:120 ubsan_epilogue+0xb/0x5a lib/ubsan.c:148 __ubsan_handle_shift_out_of_bounds.cold+0xb1/0x181 lib/ubsan.c:395 htable_bits net/netfilter/ipset/ip_set_hash_gen.h:151 [inline] hash_mac_create.cold+0x58/0x9b net/netfilter/ipset/ip_set_hash_gen.h:1524 ip_set_create+0x610/0x1380 net/netfilter/ipset/ip_set_core.c:1115 nfnetlink_rcv_msg+0xecc/0x1180 net/netfilter/nfnetlink.c:252 netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2494 nfnetlink_rcv+0x1ac/0x420 net/netfilter/nfnetlink.c:600 netlink_unicast_kernel net/netlink/af_netlink.c:1304 [inline] netlink_unicast+0x533/0x7d0 net/netlink/af_netlink.c:1330 netlink_sendmsg+0x907/0xe40 net/netlink/af_netlink.c:1919 sock_sendmsg_nosec net/socket.c:652 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:672 ____sys_sendmsg+0x6e8/0x810 net/socket.c:2345 ___sys_sendmsg+0xf3/0x170 net/socket.c:2399 __sys_sendmsg+0xe5/0x1b0 net/socket.c:2432 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 This patch replaces htable_bits() by simple fls(hashsize - 1) call: it alone returns valid nbits both for round and non-round hashsizes. It is normal to set any nbits here because it is validated inside following htable_size() call which returns 0 for nbits>31. Fixes: 1feab10d7e6d("netfilter: ipset: Unified hash type generation") Reported-by: syzbot+d66bfadebca46cf61a2b@syzkaller.appspotmail.com Signed-off-by: Vasily Averin Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_hash_gen.h | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 1e7dddf824ae..6186358eac7c 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -141,20 +141,6 @@ htable_size(u8 hbits) return hsize * sizeof(struct hbucket *) + sizeof(struct htable); } -/* Compute htable_bits from the user input parameter hashsize */ -static u8 -htable_bits(u32 hashsize) -{ - /* Assume that hashsize == 2^htable_bits */ - u8 bits = fls(hashsize - 1); - - if (jhash_size(bits) != hashsize) - /* Round up to the first 2^n value */ - bits = fls(hashsize); - - return bits; -} - #ifdef IP_SET_HASH_WITH_NETS #if IPSET_NET_COUNT > 1 #define __CIDR(cidr, i) (cidr[i]) @@ -1525,7 +1511,11 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, if (!h) return -ENOMEM; - hbits = htable_bits(hashsize); + /* Compute htable_bits from the user input parameter hashsize. + * Assume that hashsize == 2^htable_bits, + * otherwise round up to the first 2^n value. + */ + hbits = fls(hashsize - 1); hsize = htable_size(hbits); if (hsize == 0) { kfree(h); From 3ac874fa84d1baaf0c0175f2a1499f5d88d528b2 Mon Sep 17 00:00:00 2001 From: Sylwester Dziedziuch Date: Thu, 22 Oct 2020 12:39:36 +0200 Subject: [PATCH 056/547] i40e: Fix Error I40E_AQ_RC_EINVAL when removing VFs When removing VFs for PF added to bridge there was an error I40E_AQ_RC_EINVAL. It was caused by not properly resetting and reinitializing PF when adding/removing VFs. Changed how reset is performed when adding/removing VFs to properly reinitialize PFs VSI. Fixes: fc60861e9b00 ("i40e: start up in VEPA mode by default") Signed-off-by: Sylwester Dziedziuch Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e.h | 3 +++ drivers/net/ethernet/intel/i40e/i40e_main.c | 10 ++++++++++ drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 4 ++-- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index d231a2cdd98f..118473dfdcbd 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -120,6 +120,7 @@ enum i40e_state_t { __I40E_RESET_INTR_RECEIVED, __I40E_REINIT_REQUESTED, __I40E_PF_RESET_REQUESTED, + __I40E_PF_RESET_AND_REBUILD_REQUESTED, __I40E_CORE_RESET_REQUESTED, __I40E_GLOBAL_RESET_REQUESTED, __I40E_EMP_RESET_INTR_RECEIVED, @@ -146,6 +147,8 @@ enum i40e_state_t { }; #define I40E_PF_RESET_FLAG BIT_ULL(__I40E_PF_RESET_REQUESTED) +#define I40E_PF_RESET_AND_REBUILD_FLAG \ + BIT_ULL(__I40E_PF_RESET_AND_REBUILD_REQUESTED) /* VSI state flags */ enum i40e_vsi_state_t { diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 1337686bd099..1db482d310c2 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -36,6 +36,8 @@ static int i40e_setup_misc_vector(struct i40e_pf *pf); static void i40e_determine_queue_usage(struct i40e_pf *pf); static int i40e_setup_pf_filter_control(struct i40e_pf *pf); static void i40e_prep_for_reset(struct i40e_pf *pf, bool lock_acquired); +static void i40e_reset_and_rebuild(struct i40e_pf *pf, bool reinit, + bool lock_acquired); static int i40e_reset(struct i40e_pf *pf); static void i40e_rebuild(struct i40e_pf *pf, bool reinit, bool lock_acquired); static int i40e_setup_misc_vector_for_recovery_mode(struct i40e_pf *pf); @@ -8536,6 +8538,14 @@ void i40e_do_reset(struct i40e_pf *pf, u32 reset_flags, bool lock_acquired) "FW LLDP is disabled\n" : "FW LLDP is enabled\n"); + } else if (reset_flags & I40E_PF_RESET_AND_REBUILD_FLAG) { + /* Request a PF Reset + * + * Resets PF and reinitializes PFs VSI. + */ + i40e_prep_for_reset(pf, lock_acquired); + i40e_reset_and_rebuild(pf, true, lock_acquired); + } else if (reset_flags & BIT_ULL(__I40E_REINIT_REQUESTED)) { int v; diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 729c4f0d5ac5..21ee56420c3a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -1772,7 +1772,7 @@ int i40e_pci_sriov_configure(struct pci_dev *pdev, int num_vfs) if (num_vfs) { if (!(pf->flags & I40E_FLAG_VEB_MODE_ENABLED)) { pf->flags |= I40E_FLAG_VEB_MODE_ENABLED; - i40e_do_reset_safe(pf, I40E_PF_RESET_FLAG); + i40e_do_reset_safe(pf, I40E_PF_RESET_AND_REBUILD_FLAG); } ret = i40e_pci_sriov_enable(pdev, num_vfs); goto sriov_configure_out; @@ -1781,7 +1781,7 @@ int i40e_pci_sriov_configure(struct pci_dev *pdev, int num_vfs) if (!pci_vfs_assigned(pf->pdev)) { i40e_free_vfs(pf); pf->flags &= ~I40E_FLAG_VEB_MODE_ENABLED; - i40e_do_reset_safe(pf, I40E_PF_RESET_FLAG); + i40e_do_reset_safe(pf, I40E_PF_RESET_AND_REBUILD_FLAG); } else { dev_warn(&pdev->dev, "Unable to free VFs because some are assigned to VMs.\n"); ret = -EINVAL; From 8bee683384087a6275c9183a483435225f7bb209 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Mon, 14 Dec 2020 09:51:27 +0100 Subject: [PATCH 057/547] xsk: Fix memory leak for failed bind MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a possible memory leak when a bind of an AF_XDP socket fails. When the fill and completion rings are created, they are tied to the socket. But when the buffer pool is later created at bind time, the ownership of these two rings are transferred to the buffer pool as they might be shared between sockets (and the buffer pool cannot be created until we know what we are binding to). So, before the buffer pool is created, these two rings are cleaned up with the socket, and after they have been transferred they are cleaned up together with the buffer pool. The problem is that ownership was transferred before it was absolutely certain that the buffer pool could be created and initialized correctly and when one of these errors occurred, the fill and completion rings did neither belong to the socket nor the pool and where therefore leaked. Solve this by moving the ownership transfer to the point where the buffer pool has been completely set up and there is no way it can fail. Fixes: 7361f9c3d719 ("xsk: Move fill and completion rings to buffer pool") Reported-by: syzbot+cfa88ddd0655afa88763@syzkaller.appspotmail.com Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20201214085127.3960-1-magnus.karlsson@gmail.com --- net/xdp/xsk.c | 4 ++++ net/xdp/xsk_buff_pool.c | 2 -- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index ac4a317038f1..c6532d77fde7 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -878,6 +878,10 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) } } + /* FQ and CQ are now owned by the buffer pool and cleaned up with it. */ + xs->fq_tmp = NULL; + xs->cq_tmp = NULL; + xs->dev = dev; xs->zc = xs->umem->zc; xs->queue_id = qid; diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 67a4494d63b6..818b75060922 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -75,8 +75,6 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, pool->fq = xs->fq_tmp; pool->cq = xs->cq_tmp; - xs->fq_tmp = NULL; - xs->cq_tmp = NULL; for (i = 0; i < pool->free_heads_cnt; i++) { xskb = &pool->heads[i]; From f1340265726e0edf8a8cef28e665b28ad6302ce9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 2 Dec 2020 18:18:06 -0800 Subject: [PATCH 058/547] iavf: fix double-release of rtnl_lock This code does not jump to exit on an error in iavf_lan_add_device(), so the rtnl_unlock() from the normal path will follow. Fixes: b66c7bc1cd4d ("iavf: Refactor init state machine") Signed-off-by: Jakub Kicinski Reviewed-by: Tony Nguyen Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf_main.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 95543dfd4fe7..0a867d64d467 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -1834,11 +1834,9 @@ static int iavf_init_get_resources(struct iavf_adapter *adapter) netif_tx_stop_all_queues(netdev); if (CLIENT_ALLOWED(adapter)) { err = iavf_lan_add_device(adapter); - if (err) { - rtnl_unlock(); + if (err) dev_info(&pdev->dev, "Failed to add VF to client API service list: %d\n", err); - } } dev_info(&pdev->dev, "MAC address: %pM\n", adapter->hw.mac.addr); if (netdev->features & NETIF_F_GRO) From f6f92968e1e5a7a9d211faaebefc26ebe408dad7 Mon Sep 17 00:00:00 2001 From: Carl Huang Date: Thu, 17 Dec 2020 09:04:57 +0200 Subject: [PATCH 059/547] ath11k: qmi: try to allocate a big block of DMA memory first Not all firmware versions support allocating DMA memory in smaller blocks so first try to allocate big block of DMA memory for QMI. If the allocation fails, let firmware request multiple blocks of DMA memory with smaller size. This also fixes an unnecessary error message seen during ath11k probe on QCA6390: ath11k_pci 0000:06:00.0: Respond mem req failed, result: 1, err: 0 ath11k_pci 0000:06:00.0: qmi failed to respond fw mem req:-22 Tested-on: QCA6390 hw2.0 PCI WLAN.HST.1.0.1-01740-QCAHSTSWPLZ_V2_TO_X86-1 Signed-off-by: Carl Huang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1608127593-15192-1-git-send-email-kvalo@codeaurora.org --- drivers/net/wireless/ath/ath11k/qmi.c | 24 ++++++++++++++++++++++-- drivers/net/wireless/ath/ath11k/qmi.h | 1 + 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/ath/ath11k/qmi.c b/drivers/net/wireless/ath/ath11k/qmi.c index f0b5c50974f3..0db623ff4bb9 100644 --- a/drivers/net/wireless/ath/ath11k/qmi.c +++ b/drivers/net/wireless/ath/ath11k/qmi.c @@ -1660,6 +1660,7 @@ static int ath11k_qmi_respond_fw_mem_request(struct ath11k_base *ab) struct qmi_wlanfw_respond_mem_resp_msg_v01 resp; struct qmi_txn txn = {}; int ret = 0, i; + bool delayed; req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) @@ -1672,11 +1673,13 @@ static int ath11k_qmi_respond_fw_mem_request(struct ath11k_base *ab) * failure to FW and FW will then request mulitple blocks of small * chunk size memory. */ - if (!ab->bus_params.fixed_mem_region && ab->qmi.mem_seg_count <= 2) { + if (!ab->bus_params.fixed_mem_region && ab->qmi.target_mem_delayed) { + delayed = true; ath11k_dbg(ab, ATH11K_DBG_QMI, "qmi delays mem_request %d\n", ab->qmi.mem_seg_count); memset(req, 0, sizeof(*req)); } else { + delayed = false; req->mem_seg_len = ab->qmi.mem_seg_count; for (i = 0; i < req->mem_seg_len ; i++) { @@ -1708,6 +1711,12 @@ static int ath11k_qmi_respond_fw_mem_request(struct ath11k_base *ab) } if (resp.resp.result != QMI_RESULT_SUCCESS_V01) { + /* the error response is expected when + * target_mem_delayed is true. + */ + if (delayed && resp.resp.error == 0) + goto out; + ath11k_warn(ab, "Respond mem req failed, result: %d, err: %d\n", resp.resp.result, resp.resp.error); ret = -EINVAL; @@ -1742,6 +1751,8 @@ static int ath11k_qmi_alloc_target_mem_chunk(struct ath11k_base *ab) int i; struct target_mem_chunk *chunk; + ab->qmi.target_mem_delayed = false; + for (i = 0; i < ab->qmi.mem_seg_count; i++) { chunk = &ab->qmi.target_mem[i]; chunk->vaddr = dma_alloc_coherent(ab->dev, @@ -1749,6 +1760,15 @@ static int ath11k_qmi_alloc_target_mem_chunk(struct ath11k_base *ab) &chunk->paddr, GFP_KERNEL); if (!chunk->vaddr) { + if (ab->qmi.mem_seg_count <= 2) { + ath11k_dbg(ab, ATH11K_DBG_QMI, + "qmi dma allocation failed (%d B type %u), will try later with small size\n", + chunk->size, + chunk->type); + ath11k_qmi_free_target_mem_chunk(ab); + ab->qmi.target_mem_delayed = true; + return 0; + } ath11k_err(ab, "failed to alloc memory, size: 0x%x, type: %u\n", chunk->size, chunk->type); @@ -2517,7 +2537,7 @@ static void ath11k_qmi_msg_mem_request_cb(struct qmi_handle *qmi_hdl, ret); return; } - } else if (msg->mem_seg_len > 2) { + } else { ret = ath11k_qmi_alloc_target_mem_chunk(ab); if (ret) { ath11k_warn(ab, "qmi failed to alloc target memory: %d\n", diff --git a/drivers/net/wireless/ath/ath11k/qmi.h b/drivers/net/wireless/ath/ath11k/qmi.h index 92925c9eac67..7bad374cc23a 100644 --- a/drivers/net/wireless/ath/ath11k/qmi.h +++ b/drivers/net/wireless/ath/ath11k/qmi.h @@ -125,6 +125,7 @@ struct ath11k_qmi { struct target_mem_chunk target_mem[ATH11K_QMI_WLANFW_MAX_NUM_MEM_SEG_V01]; u32 mem_seg_count; u32 target_mem_mode; + bool target_mem_delayed; u8 cal_done; struct target_info target; struct m3_mem_region m3_mem; From e9603f4bdcc04417f1c7b3585e63654819dc11f6 Mon Sep 17 00:00:00 2001 From: Carl Huang Date: Thu, 17 Dec 2020 17:22:10 +0200 Subject: [PATCH 060/547] ath11k: pci: disable ASPM L0sLs before downloading firmware Sometimes QCA6390 doesn't switch to amss state as device enters L1ss state, so disable L0sL1s during firmware downloading. Driver recovers the ASPM to default value in start callback or powerdown callback. Tested-on: QCA6390 hw2.0 PCI WLAN.HST.1.0.1-01740-QCAHSTSWPLZ_V2_TO_X86-1 Signed-off-by: Carl Huang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1608218530-15426-1-git-send-email-kvalo@codeaurora.org --- drivers/net/wireless/ath/ath11k/pci.c | 36 +++++++++++++++++++++++++++ drivers/net/wireless/ath/ath11k/pci.h | 2 ++ 2 files changed, 38 insertions(+) diff --git a/drivers/net/wireless/ath/ath11k/pci.c b/drivers/net/wireless/ath/ath11k/pci.c index 9f9a824a4c2d..20b415cd96c4 100644 --- a/drivers/net/wireless/ath/ath11k/pci.c +++ b/drivers/net/wireless/ath/ath11k/pci.c @@ -886,6 +886,32 @@ static void ath11k_pci_free_region(struct ath11k_pci *ab_pci) pci_disable_device(pci_dev); } +static void ath11k_pci_aspm_disable(struct ath11k_pci *ab_pci) +{ + struct ath11k_base *ab = ab_pci->ab; + + pcie_capability_read_word(ab_pci->pdev, PCI_EXP_LNKCTL, + &ab_pci->link_ctl); + + ath11k_dbg(ab, ATH11K_DBG_PCI, "pci link_ctl 0x%04x L0s %d L1 %d\n", + ab_pci->link_ctl, + u16_get_bits(ab_pci->link_ctl, PCI_EXP_LNKCTL_ASPM_L0S), + u16_get_bits(ab_pci->link_ctl, PCI_EXP_LNKCTL_ASPM_L1)); + + /* disable L0s and L1 */ + pcie_capability_write_word(ab_pci->pdev, PCI_EXP_LNKCTL, + ab_pci->link_ctl & ~PCI_EXP_LNKCTL_ASPMC); + + set_bit(ATH11K_PCI_ASPM_RESTORE, &ab_pci->flags); +} + +static void ath11k_pci_aspm_restore(struct ath11k_pci *ab_pci) +{ + if (test_and_clear_bit(ATH11K_PCI_ASPM_RESTORE, &ab_pci->flags)) + pcie_capability_write_word(ab_pci->pdev, PCI_EXP_LNKCTL, + ab_pci->link_ctl); +} + static int ath11k_pci_power_up(struct ath11k_base *ab) { struct ath11k_pci *ab_pci = ath11k_pci_priv(ab); @@ -895,6 +921,11 @@ static int ath11k_pci_power_up(struct ath11k_base *ab) clear_bit(ATH11K_PCI_FLAG_INIT_DONE, &ab_pci->flags); ath11k_pci_sw_reset(ab_pci->ab, true); + /* Disable ASPM during firmware download due to problems switching + * to AMSS state. + */ + ath11k_pci_aspm_disable(ab_pci); + ret = ath11k_mhi_start(ab_pci); if (ret) { ath11k_err(ab, "failed to start mhi: %d\n", ret); @@ -908,6 +939,9 @@ static void ath11k_pci_power_down(struct ath11k_base *ab) { struct ath11k_pci *ab_pci = ath11k_pci_priv(ab); + /* restore aspm in case firmware bootup fails */ + ath11k_pci_aspm_restore(ab_pci); + ath11k_pci_force_wake(ab_pci->ab); ath11k_mhi_stop(ab_pci); clear_bit(ATH11K_PCI_FLAG_INIT_DONE, &ab_pci->flags); @@ -965,6 +999,8 @@ static int ath11k_pci_start(struct ath11k_base *ab) set_bit(ATH11K_PCI_FLAG_INIT_DONE, &ab_pci->flags); + ath11k_pci_aspm_restore(ab_pci); + ath11k_pci_ce_irqs_enable(ab); ath11k_ce_rx_post_buf(ab); diff --git a/drivers/net/wireless/ath/ath11k/pci.h b/drivers/net/wireless/ath/ath11k/pci.h index 0432a702416b..fe44d0dfce19 100644 --- a/drivers/net/wireless/ath/ath11k/pci.h +++ b/drivers/net/wireless/ath/ath11k/pci.h @@ -63,6 +63,7 @@ struct ath11k_msi_config { enum ath11k_pci_flags { ATH11K_PCI_FLAG_INIT_DONE, ATH11K_PCI_FLAG_IS_MSI_64, + ATH11K_PCI_ASPM_RESTORE, }; struct ath11k_pci { @@ -80,6 +81,7 @@ struct ath11k_pci { /* enum ath11k_pci_flags */ unsigned long flags; + u16 link_ctl; }; static inline struct ath11k_pci *ath11k_pci_priv(struct ath11k_base *ab) From 3d45f221ce627d13e2e6ef3274f06750c84a6542 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 2 Dec 2020 11:55:58 +0000 Subject: [PATCH 061/547] btrfs: fix deadlock when cloning inline extent and low on free metadata space When cloning an inline extent there are cases where we can not just copy the inline extent from the source range to the target range (e.g. when the target range starts at an offset greater than zero). In such cases we copy the inline extent's data into a page of the destination inode and then dirty that page. However, after that we will need to start a transaction for each processed extent and, if we are ever low on available metadata space, we may need to flush existing delalloc for all dirty inodes in an attempt to release metadata space - if that happens we may deadlock: * the async reclaim task queued a delalloc work to flush delalloc for the destination inode of the clone operation; * the task executing that delalloc work gets blocked waiting for the range with the dirty page to be unlocked, which is currently locked by the task doing the clone operation; * the async reclaim task blocks waiting for the delalloc work to complete; * the cloning task is waiting on the waitqueue of its reservation ticket while holding the range with the dirty page locked in the inode's io_tree; * if metadata space is not released by some other task (like delalloc for some other inode completing for example), the clone task waits forever and as a consequence the delalloc work and async reclaim tasks will hang forever as well. Releasing more space on the other hand may require starting a transaction, which will hang as well when trying to reserve metadata space, resulting in a deadlock between all these tasks. When this happens, traces like the following show up in dmesg/syslog: [87452.323003] INFO: task kworker/u16:11:1810830 blocked for more than 120 seconds. [87452.323644] Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1 [87452.324248] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [87452.324852] task:kworker/u16:11 state:D stack: 0 pid:1810830 ppid: 2 flags:0x00004000 [87452.325520] Workqueue: btrfs-flush_delalloc btrfs_work_helper [btrfs] [87452.326136] Call Trace: [87452.326737] __schedule+0x5d1/0xcf0 [87452.327390] schedule+0x45/0xe0 [87452.328174] lock_extent_bits+0x1e6/0x2d0 [btrfs] [87452.328894] ? finish_wait+0x90/0x90 [87452.329474] btrfs_invalidatepage+0x32c/0x390 [btrfs] [87452.330133] ? __mod_memcg_state+0x8e/0x160 [87452.330738] __extent_writepage+0x2d4/0x400 [btrfs] [87452.331405] extent_write_cache_pages+0x2b2/0x500 [btrfs] [87452.332007] ? lock_release+0x20e/0x4c0 [87452.332557] ? trace_hardirqs_on+0x1b/0xf0 [87452.333127] extent_writepages+0x43/0x90 [btrfs] [87452.333653] ? lock_acquire+0x1a3/0x490 [87452.334177] do_writepages+0x43/0xe0 [87452.334699] ? __filemap_fdatawrite_range+0xa4/0x100 [87452.335720] __filemap_fdatawrite_range+0xc5/0x100 [87452.336500] btrfs_run_delalloc_work+0x17/0x40 [btrfs] [87452.337216] btrfs_work_helper+0xf1/0x600 [btrfs] [87452.337838] process_one_work+0x24e/0x5e0 [87452.338437] worker_thread+0x50/0x3b0 [87452.339137] ? process_one_work+0x5e0/0x5e0 [87452.339884] kthread+0x153/0x170 [87452.340507] ? kthread_mod_delayed_work+0xc0/0xc0 [87452.341153] ret_from_fork+0x22/0x30 [87452.341806] INFO: task kworker/u16:1:2426217 blocked for more than 120 seconds. [87452.342487] Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1 [87452.343274] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [87452.344049] task:kworker/u16:1 state:D stack: 0 pid:2426217 ppid: 2 flags:0x00004000 [87452.344974] Workqueue: events_unbound btrfs_async_reclaim_metadata_space [btrfs] [87452.345655] Call Trace: [87452.346305] __schedule+0x5d1/0xcf0 [87452.346947] ? kvm_clock_read+0x14/0x30 [87452.347676] ? wait_for_completion+0x81/0x110 [87452.348389] schedule+0x45/0xe0 [87452.349077] schedule_timeout+0x30c/0x580 [87452.349718] ? _raw_spin_unlock_irqrestore+0x3c/0x60 [87452.350340] ? lock_acquire+0x1a3/0x490 [87452.351006] ? try_to_wake_up+0x7a/0xa20 [87452.351541] ? lock_release+0x20e/0x4c0 [87452.352040] ? lock_acquired+0x199/0x490 [87452.352517] ? wait_for_completion+0x81/0x110 [87452.353000] wait_for_completion+0xab/0x110 [87452.353490] start_delalloc_inodes+0x2af/0x390 [btrfs] [87452.353973] btrfs_start_delalloc_roots+0x12d/0x250 [btrfs] [87452.354455] flush_space+0x24f/0x660 [btrfs] [87452.355063] btrfs_async_reclaim_metadata_space+0x1bb/0x480 [btrfs] [87452.355565] process_one_work+0x24e/0x5e0 [87452.356024] worker_thread+0x20f/0x3b0 [87452.356487] ? process_one_work+0x5e0/0x5e0 [87452.356973] kthread+0x153/0x170 [87452.357434] ? kthread_mod_delayed_work+0xc0/0xc0 [87452.357880] ret_from_fork+0x22/0x30 (...) < stack traces of several tasks waiting for the locks of the inodes of the clone operation > (...) [92867.444138] RSP: 002b:00007ffc3371bbe8 EFLAGS: 00000246 ORIG_RAX: 0000000000000052 [92867.444624] RAX: ffffffffffffffda RBX: 00007ffc3371bea0 RCX: 00007f61efe73f97 [92867.445116] RDX: 0000000000000000 RSI: 0000560fbd5d7a40 RDI: 0000560fbd5d8960 [92867.445595] RBP: 00007ffc3371beb0 R08: 0000000000000001 R09: 0000000000000003 [92867.446070] R10: 00007ffc3371b996 R11: 0000000000000246 R12: 0000000000000000 [92867.446820] R13: 000000000000001f R14: 00007ffc3371bea0 R15: 00007ffc3371beb0 [92867.447361] task:fsstress state:D stack: 0 pid:2508238 ppid:2508153 flags:0x00004000 [92867.447920] Call Trace: [92867.448435] __schedule+0x5d1/0xcf0 [92867.448934] ? _raw_spin_unlock_irqrestore+0x3c/0x60 [92867.449423] schedule+0x45/0xe0 [92867.449916] __reserve_bytes+0x4a4/0xb10 [btrfs] [92867.450576] ? finish_wait+0x90/0x90 [92867.451202] btrfs_reserve_metadata_bytes+0x29/0x190 [btrfs] [92867.451815] btrfs_block_rsv_add+0x1f/0x50 [btrfs] [92867.452412] start_transaction+0x2d1/0x760 [btrfs] [92867.453216] clone_copy_inline_extent+0x333/0x490 [btrfs] [92867.453848] ? lock_release+0x20e/0x4c0 [92867.454539] ? btrfs_search_slot+0x9a7/0xc30 [btrfs] [92867.455218] btrfs_clone+0x569/0x7e0 [btrfs] [92867.455952] btrfs_clone_files+0xf6/0x150 [btrfs] [92867.456588] btrfs_remap_file_range+0x324/0x3d0 [btrfs] [92867.457213] do_clone_file_range+0xd4/0x1f0 [92867.457828] vfs_clone_file_range+0x4d/0x230 [92867.458355] ? lock_release+0x20e/0x4c0 [92867.458890] ioctl_file_clone+0x8f/0xc0 [92867.459377] do_vfs_ioctl+0x342/0x750 [92867.459913] __x64_sys_ioctl+0x62/0xb0 [92867.460377] do_syscall_64+0x33/0x80 [92867.460842] entry_SYSCALL_64_after_hwframe+0x44/0xa9 (...) < stack traces of more tasks blocked on metadata reservation like the clone task above, because the async reclaim task has deadlocked > (...) Another thing to notice is that the worker task that is deadlocked when trying to flush the destination inode of the clone operation is at btrfs_invalidatepage(). This is simply because the clone operation has a destination offset greater than the i_size and we only update the i_size of the destination file after cloning an extent (just like we do in the buffered write path). Since the async reclaim path uses btrfs_start_delalloc_roots() to trigger the flushing of delalloc for all inodes that have delalloc, add a runtime flag to an inode to signal it should not be flushed, and for inodes with that flag set, start_delalloc_inodes() will simply skip them. When the cloning code needs to dirty a page to copy an inline extent, set that flag on the inode and then clear it when the clone operation finishes. This could be sporadically triggered with test case generic/269 from fstests, which exercises many fsstress processes running in parallel with several dd processes filling up the entire filesystem. CC: stable@vger.kernel.org # 5.9+ Fixes: 05a5a7621ce6 ("Btrfs: implement full reflink support for inline extents") Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 9 +++++++++ fs/btrfs/ctree.h | 3 ++- fs/btrfs/dev-replace.c | 2 +- fs/btrfs/inode.c | 15 +++++++++++---- fs/btrfs/ioctl.c | 2 +- fs/btrfs/reflink.c | 15 +++++++++++++++ fs/btrfs/space-info.c | 2 +- 7 files changed, 40 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 555cbcef6585..d9bf53d9ff90 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -42,6 +42,15 @@ enum { * to an inode. */ BTRFS_INODE_NO_XATTRS, + /* + * Set when we are in a context where we need to start a transaction and + * have dirty pages with the respective file range locked. This is to + * ensure that when reserving space for the transaction, if we are low + * on available space and need to flush delalloc, we will not flush + * delalloc for this inode, because that could result in a deadlock (on + * the file range, inode's io_tree). + */ + BTRFS_INODE_NO_DELALLOC_FLUSH, }; /* in memory btrfs inode */ diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9dde7707873a..2674f24cf2e0 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3074,7 +3074,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u32 min_type); int btrfs_start_delalloc_snapshot(struct btrfs_root *root); -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr); +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr, + bool in_reclaim_context); int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, unsigned int extra_bits, struct extent_state **cached_state); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index a98e33f232d5..324f646d6e5e 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -715,7 +715,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, * flush all outstanding I/O and inode extent mappings before the * copy operation is declared as being finished */ - ret = btrfs_start_delalloc_roots(fs_info, U64_MAX); + ret = btrfs_start_delalloc_roots(fs_info, U64_MAX, false); if (ret) { mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8e23780acfae..070716650df8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9390,7 +9390,8 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. */ -static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot) +static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot, + bool in_reclaim_context) { struct btrfs_inode *binode; struct inode *inode; @@ -9411,6 +9412,11 @@ static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot list_move_tail(&binode->delalloc_inodes, &root->delalloc_inodes); + + if (in_reclaim_context && + test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags)) + continue; + inode = igrab(&binode->vfs_inode); if (!inode) { cond_resched_lock(&root->delalloc_lock); @@ -9464,10 +9470,11 @@ int btrfs_start_delalloc_snapshot(struct btrfs_root *root) if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) return -EROFS; - return start_delalloc_inodes(root, &nr, true); + return start_delalloc_inodes(root, &nr, true, false); } -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr) +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr, + bool in_reclaim_context) { struct btrfs_root *root; struct list_head splice; @@ -9490,7 +9497,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr) &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); - ret = start_delalloc_inodes(root, &nr, false); + ret = start_delalloc_inodes(root, &nr, false, in_reclaim_context); btrfs_put_root(root); if (ret < 0) goto out; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 703212ff50a5..dde49a791f3e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4951,7 +4951,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SYNC: { int ret; - ret = btrfs_start_delalloc_roots(fs_info, U64_MAX); + ret = btrfs_start_delalloc_roots(fs_info, U64_MAX, false); if (ret) return ret; ret = btrfs_sync_fs(inode->i_sb, 1); diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index ab80896315be..b03e7891394e 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -89,6 +89,19 @@ static int copy_inline_to_page(struct btrfs_inode *inode, if (ret) goto out_unlock; + /* + * After dirtying the page our caller will need to start a transaction, + * and if we are low on metadata free space, that can cause flushing of + * delalloc for all inodes in order to get metadata space released. + * However we are holding the range locked for the whole duration of + * the clone/dedupe operation, so we may deadlock if that happens and no + * other task releases enough space. So mark this inode as not being + * possible to flush to avoid such deadlock. We will clear that flag + * when we finish cloning all extents, since a transaction is started + * after finding each extent to clone. + */ + set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags); + if (comp_type == BTRFS_COMPRESS_NONE) { char *map; @@ -549,6 +562,8 @@ process_slot: out: btrfs_free_path(path); kvfree(buf); + clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags); + return ret; } diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 64099565ab8f..67e55c5479b8 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -532,7 +532,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, loops = 0; while ((delalloc_bytes || dio_bytes) && loops < 3) { - btrfs_start_delalloc_roots(fs_info, items); + btrfs_start_delalloc_roots(fs_info, items, true); loops++; if (wait_ordered && !trans) { From 9a664971569daf68254928149f580b4f5856d274 Mon Sep 17 00:00:00 2001 From: ethanwu Date: Tue, 1 Dec 2020 17:25:12 +0800 Subject: [PATCH 062/547] btrfs: correctly calculate item size used when item key collision happens Item key collision is allowed for some item types, like dir item and inode refs, but the overall item size is limited by the nodesize. item size(ins_len) passed from btrfs_insert_empty_items to btrfs_search_slot already contains size of btrfs_item. When btrfs_search_slot reaches leaf, we'll see if we need to split leaf. The check incorrectly reports that split leaf is required, because it treats the space required by the newly inserted item as btrfs_item + item data. But in item key collision case, only item data is actually needed, the newly inserted item could merge into the existing one. No new btrfs_item will be inserted. And split_leaf return EOVERFLOW from following code: if (extend && data_size + btrfs_item_size_nr(l, slot) + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info)) return -EOVERFLOW; In most cases, when callers receive EOVERFLOW, they either return this error or handle in different ways. For example, in normal dir item creation the userspace will get errno EOVERFLOW; in inode ref case INODE_EXTREF is used instead. However, this is not the case for rename. To avoid the unrecoverable situation in rename, btrfs_check_dir_item_collision is called in early phase of rename. In this function, when item key collision is detected leaf space is checked: data_size = sizeof(*di) + name_len; if (data_size + btrfs_item_size_nr(leaf, slot) + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root->fs_info)) the sizeof(struct btrfs_item) + btrfs_item_size_nr(leaf, slot) here refers to existing item size, the condition here correctly calculates the needed size for collision case rather than the wrong case above. The consequence of inconsistent condition check between btrfs_check_dir_item_collision and btrfs_search_slot when item key collision happens is that we might pass check here but fail later at btrfs_search_slot. Rename fails and volume is forced readonly [436149.586170] ------------[ cut here ]------------ [436149.586173] BTRFS: Transaction aborted (error -75) [436149.586196] WARNING: CPU: 0 PID: 16733 at fs/btrfs/inode.c:9870 btrfs_rename2+0x1938/0x1b70 [btrfs] [436149.586227] CPU: 0 PID: 16733 Comm: python Tainted: G D 4.18.0-rc5+ #1 [436149.586228] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 04/05/2016 [436149.586238] RIP: 0010:btrfs_rename2+0x1938/0x1b70 [btrfs] [436149.586254] RSP: 0018:ffffa327043a7ce0 EFLAGS: 00010286 [436149.586255] RAX: 0000000000000000 RBX: ffff8d8a17d13340 RCX: 0000000000000006 [436149.586256] RDX: 0000000000000007 RSI: 0000000000000096 RDI: ffff8d8a7fc164b0 [436149.586257] RBP: ffffa327043a7da0 R08: 0000000000000560 R09: 7265282064657472 [436149.586258] R10: 0000000000000000 R11: 6361736e61725420 R12: ffff8d8a0d4c8b08 [436149.586258] R13: ffff8d8a17d13340 R14: ffff8d8a33e0a540 R15: 00000000000001fe [436149.586260] FS: 00007fa313933740(0000) GS:ffff8d8a7fc00000(0000) knlGS:0000000000000000 [436149.586261] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [436149.586262] CR2: 000055d8d9c9a720 CR3: 000000007aae0003 CR4: 00000000003606f0 [436149.586295] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [436149.586296] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [436149.586296] Call Trace: [436149.586311] vfs_rename+0x383/0x920 [436149.586313] ? vfs_rename+0x383/0x920 [436149.586315] do_renameat2+0x4ca/0x590 [436149.586317] __x64_sys_rename+0x20/0x30 [436149.586324] do_syscall_64+0x5a/0x120 [436149.586330] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [436149.586332] RIP: 0033:0x7fa3133b1d37 [436149.586348] RSP: 002b:00007fffd3e43908 EFLAGS: 00000246 ORIG_RAX: 0000000000000052 [436149.586349] RAX: ffffffffffffffda RBX: 00007fa3133b1d30 RCX: 00007fa3133b1d37 [436149.586350] RDX: 000055d8da06b5e0 RSI: 000055d8da225d60 RDI: 000055d8da2c4da0 [436149.586351] RBP: 000055d8da2252f0 R08: 00007fa313782000 R09: 00000000000177e0 [436149.586351] R10: 000055d8da010680 R11: 0000000000000246 R12: 00007fa313840b00 Thanks to Hans van Kranenburg for information about crc32 hash collision tools, I was able to reproduce the dir item collision with following python script. https://github.com/wutzuchieh/misc_tools/blob/master/crc32_forge.py Run it under a btrfs volume will trigger the abort transaction. It simply creates files and rename them to forged names that leads to hash collision. There are two ways to fix this. One is to simply revert the patch 878f2d2cb355 ("Btrfs: fix max dir item size calculation") to make the condition consistent although that patch is correct about the size. The other way is to handle the leaf space check correctly when collision happens. I prefer the second one since it correct leaf space check in collision case. This fix will not account sizeof(struct btrfs_item) when the item already exists. There are two places where ins_len doesn't contain sizeof(struct btrfs_item), however. 1. extent-tree.c: lookup_inline_extent_backref 2. file-item.c: btrfs_csum_file_blocks to make the logic of btrfs_search_slot more clear, we add a flag search_for_extension in btrfs_path. This flag indicates that ins_len passed to btrfs_search_slot doesn't contain sizeof(struct btrfs_item). When key exists, btrfs_search_slot will use the actual size needed to calculate the required leaf space. CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Filipe Manana Signed-off-by: ethanwu Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 24 ++++++++++++++++++++++-- fs/btrfs/ctree.h | 6 ++++++ fs/btrfs/extent-tree.c | 2 ++ fs/btrfs/file-item.c | 2 ++ 4 files changed, 32 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 07810891e204..cc89b63d65a4 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2555,8 +2555,14 @@ out: * @p: Holds all btree nodes along the search path * @root: The root node of the tree * @key: The key we are looking for - * @ins_len: Indicates purpose of search, for inserts it is 1, for - * deletions it's -1. 0 for plain searches + * @ins_len: Indicates purpose of search: + * >0 for inserts it's size of item inserted (*) + * <0 for deletions + * 0 for plain searches, not modifying the tree + * + * (*) If size of item inserted doesn't include + * sizeof(struct btrfs_item), then p->search_for_extension must + * be set. * @cow: boolean should CoW operations be performed. Must always be 1 * when modifying the tree. * @@ -2717,6 +2723,20 @@ cow_done: if (level == 0) { p->slots[level] = slot; + /* + * Item key already exists. In this case, if we are + * allowed to insert the item (for example, in dir_item + * case, item key collision is allowed), it will be + * merged with the original item. Only the item size + * grows, no new btrfs item will be added. If + * search_for_extension is not set, ins_len already + * accounts the size btrfs_item, deduct it here so leaf + * space check will be correct. + */ + if (ret == 0 && ins_len > 0 && !p->search_for_extension) { + ASSERT(ins_len >= sizeof(struct btrfs_item)); + ins_len -= sizeof(struct btrfs_item); + } if (ins_len > 0 && btrfs_leaf_free_space(b) < ins_len) { if (write_lock_level < 1) { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2674f24cf2e0..3935d297d198 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -368,6 +368,12 @@ struct btrfs_path { unsigned int search_commit_root:1; unsigned int need_commit_sem:1; unsigned int skip_release_on_error:1; + /* + * Indicate that new item (btrfs_search_slot) is extending already + * existing item and ins_len contains only the data size and not item + * header (ie. sizeof(struct btrfs_item) is not included). + */ + unsigned int search_for_extension:1; }; #define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \ sizeof(struct btrfs_item)) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 56ea380f5a17..d79b8369e6aa 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -844,6 +844,7 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, want = extent_ref_type(parent, owner); if (insert) { extra_size = btrfs_extent_inline_ref_size(want); + path->search_for_extension = 1; path->keep_locks = 1; } else extra_size = -1; @@ -996,6 +997,7 @@ again: out: if (insert) { path->keep_locks = 0; + path->search_for_extension = 0; btrfs_unlock_up_safe(path, 1); } return err; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 1545c22ef280..6ccfc019ad90 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -1016,8 +1016,10 @@ again: } btrfs_release_path(path); + path->search_for_extension = 1; ret = btrfs_search_slot(trans, root, &file_key, path, csum_size, 1); + path->search_for_extension = 0; if (ret < 0) goto out; From ae5e070eaca9dbebde3459dd8f4c2756f8c097d0 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 4 Dec 2020 09:24:47 +0800 Subject: [PATCH 063/547] btrfs: qgroup: don't try to wait flushing if we're already holding a transaction There is a chance of racing for qgroup flushing which may lead to deadlock: Thread A | Thread B (not holding trans handle) | (holding a trans handle) --------------------------------+-------------------------------- __btrfs_qgroup_reserve_meta() | __btrfs_qgroup_reserve_meta() |- try_flush_qgroup() | |- try_flush_qgroup() |- QGROUP_FLUSHING bit set | | | | |- test_and_set_bit() | | |- wait_event() |- btrfs_join_transaction() | |- btrfs_commit_transaction()| !!! DEAD LOCK !!! Since thread A wants to commit transaction, but thread B is holding a transaction handle, blocking the commit. At the same time, thread B is waiting for thread A to finish its commit. This is just a hot fix, and would lead to more EDQUOT when we're near the qgroup limit. The proper fix would be to make all metadata/data reservations happen without holding a transaction handle. CC: stable@vger.kernel.org # 5.9+ Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index fe3046007f52..47f27658eac1 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3530,16 +3530,6 @@ static int try_flush_qgroup(struct btrfs_root *root) int ret; bool can_commit = true; - /* - * We don't want to run flush again and again, so if there is a running - * one, we won't try to start a new flush, but exit directly. - */ - if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) { - wait_event(root->qgroup_flush_wait, - !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)); - return 0; - } - /* * If current process holds a transaction, we shouldn't flush, as we * assume all space reservation happens before a transaction handle is @@ -3554,6 +3544,26 @@ static int try_flush_qgroup(struct btrfs_root *root) current->journal_info != BTRFS_SEND_TRANS_STUB) can_commit = false; + /* + * We don't want to run flush again and again, so if there is a running + * one, we won't try to start a new flush, but exit directly. + */ + if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) { + /* + * We are already holding a transaction, thus we can block other + * threads from flushing. So exit right now. This increases + * the chance of EDQUOT for heavy load and near limit cases. + * But we can argue that if we're already near limit, EDQUOT is + * unavoidable anyway. + */ + if (!can_commit) + return 0; + + wait_event(root->qgroup_flush_wait, + !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)); + return 0; + } + ret = btrfs_start_delalloc_snapshot(root); if (ret < 0) goto out; From 0b3f407e6728d990ae1630a02c7b952c21c288d3 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 10 Dec 2020 12:09:02 +0000 Subject: [PATCH 064/547] btrfs: send: fix wrong file path when there is an inode with a pending rmdir When doing an incremental send, if we have a new inode that happens to have the same number that an old directory inode had in the base snapshot and that old directory has a pending rmdir operation, we end up computing a wrong path for the new inode, causing the receiver to fail. Example reproducer: $ cat test-send-rmdir.sh #!/bin/bash DEV=/dev/sdi MNT=/mnt/sdi mkfs.btrfs -f $DEV >/dev/null mount $DEV $MNT mkdir $MNT/dir touch $MNT/dir/file1 touch $MNT/dir/file2 touch $MNT/dir/file3 # Filesystem looks like: # # . (ino 256) # |----- dir/ (ino 257) # |----- file1 (ino 258) # |----- file2 (ino 259) # |----- file3 (ino 260) # btrfs subvolume snapshot -r $MNT $MNT/snap1 btrfs send -f /tmp/snap1.send $MNT/snap1 # Now remove our directory and all its files. rm -fr $MNT/dir # Unmount the filesystem and mount it again. This is to ensure that # the next inode that is created ends up with the same inode number # that our directory "dir" had, 257, which is the first free "objectid" # available after mounting again the filesystem. umount $MNT mount $DEV $MNT # Now create a new file (it could be a directory as well). touch $MNT/newfile # Filesystem now looks like: # # . (ino 256) # |----- newfile (ino 257) # btrfs subvolume snapshot -r $MNT $MNT/snap2 btrfs send -f /tmp/snap2.send -p $MNT/snap1 $MNT/snap2 # Now unmount the filesystem, create a new one, mount it and try to apply # both send streams to recreate both snapshots. umount $DEV mkfs.btrfs -f $DEV >/dev/null mount $DEV $MNT btrfs receive -f /tmp/snap1.send $MNT btrfs receive -f /tmp/snap2.send $MNT umount $MNT When running the test, the receive operation for the incremental stream fails: $ ./test-send-rmdir.sh Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap1' At subvol /mnt/sdi/snap1 Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap2' At subvol /mnt/sdi/snap2 At subvol snap1 At snapshot snap2 ERROR: chown o257-9-0 failed: No such file or directory So fix this by tracking directories that have a pending rmdir by inode number and generation number, instead of only inode number. A test case for fstests follows soon. Reported-by: Massimo B. Tested-by: Massimo B. Link: https://lore.kernel.org/linux-btrfs/6ae34776e85912960a253a8327068a892998e685.camel@gmx.net/ CC: stable@vger.kernel.org # 4.19+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/send.c | 49 +++++++++++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d719a2755a40..ae97f4dbaff3 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -236,6 +236,7 @@ struct waiting_dir_move { * after this directory is moved, we can try to rmdir the ino rmdir_ino. */ u64 rmdir_ino; + u64 rmdir_gen; bool orphanized; }; @@ -316,7 +317,7 @@ static int is_waiting_for_move(struct send_ctx *sctx, u64 ino); static struct waiting_dir_move * get_waiting_dir_move(struct send_ctx *sctx, u64 ino); -static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino); +static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen); static int need_send_hole(struct send_ctx *sctx) { @@ -2299,7 +2300,7 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, fs_path_reset(name); - if (is_waiting_for_rm(sctx, ino)) { + if (is_waiting_for_rm(sctx, ino, gen)) { ret = gen_unique_name(sctx, ino, gen, name); if (ret < 0) goto out; @@ -2858,8 +2859,8 @@ out: return ret; } -static struct orphan_dir_info * -add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino) +static struct orphan_dir_info *add_orphan_dir_info(struct send_ctx *sctx, + u64 dir_ino, u64 dir_gen) { struct rb_node **p = &sctx->orphan_dirs.rb_node; struct rb_node *parent = NULL; @@ -2868,20 +2869,23 @@ add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino) while (*p) { parent = *p; entry = rb_entry(parent, struct orphan_dir_info, node); - if (dir_ino < entry->ino) { + if (dir_ino < entry->ino) p = &(*p)->rb_left; - } else if (dir_ino > entry->ino) { + else if (dir_ino > entry->ino) p = &(*p)->rb_right; - } else { + else if (dir_gen < entry->gen) + p = &(*p)->rb_left; + else if (dir_gen > entry->gen) + p = &(*p)->rb_right; + else return entry; - } } odi = kmalloc(sizeof(*odi), GFP_KERNEL); if (!odi) return ERR_PTR(-ENOMEM); odi->ino = dir_ino; - odi->gen = 0; + odi->gen = dir_gen; odi->last_dir_index_offset = 0; rb_link_node(&odi->node, parent, p); @@ -2889,8 +2893,8 @@ add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino) return odi; } -static struct orphan_dir_info * -get_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino) +static struct orphan_dir_info *get_orphan_dir_info(struct send_ctx *sctx, + u64 dir_ino, u64 gen) { struct rb_node *n = sctx->orphan_dirs.rb_node; struct orphan_dir_info *entry; @@ -2901,15 +2905,19 @@ get_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino) n = n->rb_left; else if (dir_ino > entry->ino) n = n->rb_right; + else if (gen < entry->gen) + n = n->rb_left; + else if (gen > entry->gen) + n = n->rb_right; else return entry; } return NULL; } -static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino) +static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen) { - struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino); + struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino, gen); return odi != NULL; } @@ -2954,7 +2962,7 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, key.type = BTRFS_DIR_INDEX_KEY; key.offset = 0; - odi = get_orphan_dir_info(sctx, dir); + odi = get_orphan_dir_info(sctx, dir, dir_gen); if (odi) key.offset = odi->last_dir_index_offset; @@ -2985,7 +2993,7 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, dm = get_waiting_dir_move(sctx, loc.objectid); if (dm) { - odi = add_orphan_dir_info(sctx, dir); + odi = add_orphan_dir_info(sctx, dir, dir_gen); if (IS_ERR(odi)) { ret = PTR_ERR(odi); goto out; @@ -2993,12 +3001,13 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, odi->gen = dir_gen; odi->last_dir_index_offset = found_key.offset; dm->rmdir_ino = dir; + dm->rmdir_gen = dir_gen; ret = 0; goto out; } if (loc.objectid > send_progress) { - odi = add_orphan_dir_info(sctx, dir); + odi = add_orphan_dir_info(sctx, dir, dir_gen); if (IS_ERR(odi)) { ret = PTR_ERR(odi); goto out; @@ -3038,6 +3047,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized) return -ENOMEM; dm->ino = ino; dm->rmdir_ino = 0; + dm->rmdir_gen = 0; dm->orphanized = orphanized; while (*p) { @@ -3183,7 +3193,7 @@ static int path_loop(struct send_ctx *sctx, struct fs_path *name, while (ino != BTRFS_FIRST_FREE_OBJECTID) { fs_path_reset(name); - if (is_waiting_for_rm(sctx, ino)) + if (is_waiting_for_rm(sctx, ino, gen)) break; if (is_waiting_for_move(sctx, ino)) { if (*ancestor_ino == 0) @@ -3223,6 +3233,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) u64 parent_ino, parent_gen; struct waiting_dir_move *dm = NULL; u64 rmdir_ino = 0; + u64 rmdir_gen; u64 ancestor; bool is_orphan; int ret; @@ -3237,6 +3248,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) dm = get_waiting_dir_move(sctx, pm->ino); ASSERT(dm); rmdir_ino = dm->rmdir_ino; + rmdir_gen = dm->rmdir_gen; is_orphan = dm->orphanized; free_waiting_dir_move(sctx, dm); @@ -3273,6 +3285,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) dm = get_waiting_dir_move(sctx, pm->ino); ASSERT(dm); dm->rmdir_ino = rmdir_ino; + dm->rmdir_gen = rmdir_gen; } goto out; } @@ -3291,7 +3304,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) struct orphan_dir_info *odi; u64 gen; - odi = get_orphan_dir_info(sctx, rmdir_ino); + odi = get_orphan_dir_info(sctx, rmdir_ino, rmdir_gen); if (!odi) { /* already deleted */ goto finish; From 675a4fc8f3149e93f35fb5739fd8d4764206ba0b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 15 Dec 2020 12:00:26 -0500 Subject: [PATCH 065/547] btrfs: tests: initialize test inodes location I noticed that sometimes the module failed to load because the self tests failed like this: BTRFS: selftest: fs/btrfs/tests/inode-tests.c:963 miscount, wanted 1, got 0 This turned out to be because sometimes the btrfs ino would be the btree inode number, and thus we'd skip calling the set extent delalloc bit helper, and thus not adjust ->outstanding_extents. Fix this by making sure we initialize test inodes with a valid inode number so that we don't get random failures during self tests. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tests/btrfs-tests.c | 10 ++++++++-- fs/btrfs/tests/inode-tests.c | 9 --------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 8ca334d554af..6bd97bd4cb37 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -55,8 +55,14 @@ struct inode *btrfs_new_test_inode(void) struct inode *inode; inode = new_inode(test_mnt->mnt_sb); - if (inode) - inode_init_owner(inode, NULL, S_IFREG); + if (!inode) + return NULL; + + inode->i_mode = S_IFREG; + BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; + BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; + BTRFS_I(inode)->location.offset = 0; + inode_init_owner(inode, NULL, S_IFREG); return inode; } diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 04022069761d..c9874b12d337 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -232,11 +232,6 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) return ret; } - inode->i_mode = S_IFREG; - BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; - BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; - BTRFS_I(inode)->location.offset = 0; - fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); if (!fs_info) { test_std_err(TEST_ALLOC_FS_INFO); @@ -835,10 +830,6 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) return ret; } - BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; - BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; - BTRFS_I(inode)->location.offset = 0; - fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); if (!fs_info) { test_std_err(TEST_ALLOC_FS_INFO); From ea9ed87c73e87e044b2c58d658eb4ba5216bc488 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 6 Dec 2020 15:56:20 +0000 Subject: [PATCH 066/547] btrfs: fix async discard stall Might happen that bg->discard_eligible_time was changed without rescheduling, so btrfs_discard_workfn() wakes up earlier than that new time, peek_discard_list() returns NULL, and all work halts and goes to sleep without further rescheduling even there are block groups to discard. It happens pretty often, but not so visible from the userspace because after some time it usually will be kicked off anyway by someone else calling btrfs_discard_reschedule_work(). Fix it by continue rescheduling if block group discard lists are not empty. Reviewed-by: Josef Bacik Signed-off-by: Pavel Begunkov Signed-off-by: David Sterba --- fs/btrfs/discard.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index 1db966bf85b2..36431d7e1334 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -199,16 +199,15 @@ static struct btrfs_block_group *find_next_block_group( static struct btrfs_block_group *peek_discard_list( struct btrfs_discard_ctl *discard_ctl, enum btrfs_discard_state *discard_state, - int *discard_index) + int *discard_index, u64 now) { struct btrfs_block_group *block_group; - const u64 now = ktime_get_ns(); spin_lock(&discard_ctl->lock); again: block_group = find_next_block_group(discard_ctl, now); - if (block_group && now > block_group->discard_eligible_time) { + if (block_group && now >= block_group->discard_eligible_time) { if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED && block_group->used != 0) { if (btrfs_is_block_group_data_only(block_group)) @@ -222,12 +221,11 @@ again: block_group->discard_state = BTRFS_DISCARD_EXTENTS; } discard_ctl->block_group = block_group; + } + if (block_group) { *discard_state = block_group->discard_state; *discard_index = block_group->discard_index; - } else { - block_group = NULL; } - spin_unlock(&discard_ctl->lock); return block_group; @@ -438,13 +436,18 @@ static void btrfs_discard_workfn(struct work_struct *work) int discard_index = 0; u64 trimmed = 0; u64 minlen = 0; + u64 now = ktime_get_ns(); discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work); block_group = peek_discard_list(discard_ctl, &discard_state, - &discard_index); + &discard_index, now); if (!block_group || !btrfs_run_discard_work(discard_ctl)) return; + if (now < block_group->discard_eligible_time) { + btrfs_discard_schedule_work(discard_ctl, false); + return; + } /* Perform discarding */ minlen = discard_minlen[discard_index]; From 1ea2872fc6f2aaee0a4b4f1578b83ffd9f55c6a7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 6 Dec 2020 15:56:21 +0000 Subject: [PATCH 067/547] btrfs: fix racy access to discard_ctl data Because only one discard worker may be running at any given point, it could have been safe to modify ->prev_discard, etc. without synchronization, if not for @override flag in btrfs_discard_schedule_work() and delayed_work_pending() returning false while workfn is running. That may lead to torn reads of u64 for some architectures, but that's not a big problem as only slightly affects the discard rate. Suggested-by: Josef Bacik Reviewed-by: Josef Bacik Signed-off-by: Pavel Begunkov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/discard.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index 36431d7e1334..d641f451f840 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -477,13 +477,6 @@ static void btrfs_discard_workfn(struct work_struct *work) discard_ctl->discard_extent_bytes += trimmed; } - /* - * Updated without locks as this is inside the workfn and nothing else - * is reading the values - */ - discard_ctl->prev_discard = trimmed; - discard_ctl->prev_discard_time = ktime_get_ns(); - /* Determine next steps for a block_group */ if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) { if (discard_state == BTRFS_DISCARD_BITMAPS) { @@ -499,7 +492,10 @@ static void btrfs_discard_workfn(struct work_struct *work) } } + now = ktime_get_ns(); spin_lock(&discard_ctl->lock); + discard_ctl->prev_discard = trimmed; + discard_ctl->prev_discard_time = now; discard_ctl->block_group = NULL; spin_unlock(&discard_ctl->lock); From 8fc058597a283e9a37720abb0e8d68e342b9387d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 6 Dec 2020 15:56:22 +0000 Subject: [PATCH 068/547] btrfs: merge critical sections of discard lock in workfn btrfs_discard_workfn() drops discard_ctl->lock just to take it again in a moment in btrfs_discard_schedule_work(). Avoid that and also reuse ktime. Reviewed-by: Josef Bacik Signed-off-by: Pavel Begunkov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/discard.c | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index d641f451f840..2b8383d41144 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -328,28 +328,15 @@ void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl, btrfs_discard_schedule_work(discard_ctl, false); } -/** - * btrfs_discard_schedule_work - responsible for scheduling the discard work - * @discard_ctl: discard control - * @override: override the current timer - * - * Discards are issued by a delayed workqueue item. @override is used to - * update the current delay as the baseline delay interval is reevaluated on - * transaction commit. This is also maxed with any other rate limit. - */ -void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, - bool override) +static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, + u64 now, bool override) { struct btrfs_block_group *block_group; - const u64 now = ktime_get_ns(); - - spin_lock(&discard_ctl->lock); if (!btrfs_run_discard_work(discard_ctl)) - goto out; - + return; if (!override && delayed_work_pending(&discard_ctl->work)) - goto out; + return; block_group = find_next_block_group(discard_ctl, now); if (block_group) { @@ -391,7 +378,24 @@ void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, mod_delayed_work(discard_ctl->discard_workers, &discard_ctl->work, nsecs_to_jiffies(delay)); } -out: +} + +/* + * btrfs_discard_schedule_work - responsible for scheduling the discard work + * @discard_ctl: discard control + * @override: override the current timer + * + * Discards are issued by a delayed workqueue item. @override is used to + * update the current delay as the baseline delay interval is reevaluated on + * transaction commit. This is also maxed with any other rate limit. + */ +void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, + bool override) +{ + const u64 now = ktime_get_ns(); + + spin_lock(&discard_ctl->lock); + __btrfs_discard_schedule_work(discard_ctl, now, override); spin_unlock(&discard_ctl->lock); } @@ -497,9 +501,8 @@ static void btrfs_discard_workfn(struct work_struct *work) discard_ctl->prev_discard = trimmed; discard_ctl->prev_discard_time = now; discard_ctl->block_group = NULL; + __btrfs_discard_schedule_work(discard_ctl, now, false); spin_unlock(&discard_ctl->lock); - - btrfs_discard_schedule_work(discard_ctl, false); } /** From cb13eea3b49055bd78e6ddf39defd6340f7379fc Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 14 Dec 2020 10:10:45 +0000 Subject: [PATCH 069/547] btrfs: fix transaction leak and crash after RO remount caused by qgroup rescan If we remount a filesystem in RO mode while the qgroup rescan worker is running, we can end up having it still running after the remount is done, and at unmount time we may end up with an open transaction that ends up never getting committed. If that happens we end up with several memory leaks and can crash when hardware acceleration is unavailable for crc32c. Possibly it can lead to other nasty surprises too, due to use-after-free issues. The following steps explain how the problem happens. 1) We have a filesystem mounted in RW mode and the qgroup rescan worker is running; 2) We remount the filesystem in RO mode, and never stop/pause the rescan worker, so after the remount the rescan worker is still running. The important detail here is that the rescan task is still running after the remount operation committed any ongoing transaction through its call to btrfs_commit_super(); 3) The rescan is still running, and after the remount completed, the rescan worker started a transaction, after it finished iterating all leaves of the extent tree, to update the qgroup status item in the quotas tree. It does not commit the transaction, it only releases its handle on the transaction; 4) A filesystem unmount operation starts shortly after; 5) The unmount task, at close_ctree(), stops the transaction kthread, which had not had a chance to commit the open transaction since it was sleeping and the commit interval (default of 30 seconds) has not yet elapsed since the last time it committed a transaction; 6) So after stopping the transaction kthread we still have the transaction used to update the qgroup status item open. At close_ctree(), when the filesystem is in RO mode and no transaction abort happened (or the filesystem is in error mode), we do not expect to have any transaction open, so we do not call btrfs_commit_super(); 7) We then proceed to destroy the work queues, free the roots and block groups, etc. After that we drop the last reference on the btree inode by calling iput() on it. Since there are dirty pages for the btree inode, corresponding to the COWed extent buffer for the quotas btree, btree_write_cache_pages() is invoked to flush those dirty pages. This results in creating a bio and submitting it, which makes us end up at btrfs_submit_metadata_bio(); 8) At btrfs_submit_metadata_bio() we end up at the if-then-else branch that calls btrfs_wq_submit_bio(), because check_async_write() returned a value of 1. This value of 1 is because we did not have hardware acceleration available for crc32c, so BTRFS_FS_CSUM_IMPL_FAST was not set in fs_info->flags; 9) Then at btrfs_wq_submit_bio() we call btrfs_queue_work() against the workqueue at fs_info->workers, which was already freed before by the call to btrfs_stop_all_workers() at close_ctree(). This results in an invalid memory access due to a use-after-free, leading to a crash. When this happens, before the crash there are several warnings triggered, since we have reserved metadata space in a block group, the delayed refs reservation, etc: ------------[ cut here ]------------ WARNING: CPU: 4 PID: 1729896 at fs/btrfs/block-group.c:125 btrfs_put_block_group+0x63/0xa0 [btrfs] Modules linked in: btrfs dm_snapshot dm_thin_pool (...) CPU: 4 PID: 1729896 Comm: umount Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:btrfs_put_block_group+0x63/0xa0 [btrfs] Code: f0 01 00 00 48 39 c2 75 (...) RSP: 0018:ffffb270826bbdd8 EFLAGS: 00010206 RAX: 0000000000000001 RBX: ffff947ed73e4000 RCX: ffff947ebc8b29c8 RDX: 0000000000000001 RSI: ffffffffc0b150a0 RDI: ffff947ebc8b2800 RBP: ffff947ebc8b2800 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000001 R12: ffff947ed73e4110 R13: ffff947ed73e4160 R14: ffff947ebc8b2988 R15: dead000000000100 FS: 00007f15edfea840(0000) GS:ffff9481ad600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f37e2893320 CR3: 0000000138f68001 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: btrfs_free_block_groups+0x17f/0x2f0 [btrfs] close_ctree+0x2ba/0x2fa [btrfs] generic_shutdown_super+0x6c/0x100 kill_anon_super+0x14/0x30 btrfs_kill_super+0x12/0x20 [btrfs] deactivate_locked_super+0x31/0x70 cleanup_mnt+0x100/0x160 task_work_run+0x68/0xb0 exit_to_user_mode_prepare+0x1bb/0x1c0 syscall_exit_to_user_mode+0x4b/0x260 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f15ee221ee7 Code: ff 0b 00 f7 d8 64 89 01 48 (...) RSP: 002b:00007ffe9470f0f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 RAX: 0000000000000000 RBX: 00007f15ee347264 RCX: 00007f15ee221ee7 RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 000056169701d000 RBP: 0000561697018a30 R08: 0000000000000000 R09: 00007f15ee2e2be0 R10: 000056169701efe0 R11: 0000000000000246 R12: 0000000000000000 R13: 000056169701d000 R14: 0000561697018b40 R15: 0000561697018c60 irq event stamp: 0 hardirqs last enabled at (0): [<0000000000000000>] 0x0 hardirqs last disabled at (0): [] copy_process+0x8a0/0x1d70 softirqs last enabled at (0): [] copy_process+0x8a0/0x1d70 softirqs last disabled at (0): [<0000000000000000>] 0x0 ---[ end trace dd74718fef1ed5c6 ]--- ------------[ cut here ]------------ WARNING: CPU: 2 PID: 1729896 at fs/btrfs/block-rsv.c:459 btrfs_release_global_block_rsv+0x70/0xc0 [btrfs] Modules linked in: btrfs dm_snapshot dm_thin_pool (...) CPU: 2 PID: 1729896 Comm: umount Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:btrfs_release_global_block_rsv+0x70/0xc0 [btrfs] Code: 48 83 bb b0 03 00 00 00 (...) RSP: 0018:ffffb270826bbdd8 EFLAGS: 00010206 RAX: 000000000033c000 RBX: ffff947ed73e4000 RCX: 0000000000000000 RDX: 0000000000000001 RSI: ffffffffc0b0d8c1 RDI: 00000000ffffffff RBP: ffff947ebc8b7000 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000001 R12: ffff947ed73e4110 R13: ffff947ed73e5278 R14: dead000000000122 R15: dead000000000100 FS: 00007f15edfea840(0000) GS:ffff9481aca00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000561a79f76e20 CR3: 0000000138f68006 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: btrfs_free_block_groups+0x24c/0x2f0 [btrfs] close_ctree+0x2ba/0x2fa [btrfs] generic_shutdown_super+0x6c/0x100 kill_anon_super+0x14/0x30 btrfs_kill_super+0x12/0x20 [btrfs] deactivate_locked_super+0x31/0x70 cleanup_mnt+0x100/0x160 task_work_run+0x68/0xb0 exit_to_user_mode_prepare+0x1bb/0x1c0 syscall_exit_to_user_mode+0x4b/0x260 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f15ee221ee7 Code: ff 0b 00 f7 d8 64 89 01 (...) RSP: 002b:00007ffe9470f0f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 RAX: 0000000000000000 RBX: 00007f15ee347264 RCX: 00007f15ee221ee7 RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 000056169701d000 RBP: 0000561697018a30 R08: 0000000000000000 R09: 00007f15ee2e2be0 R10: 000056169701efe0 R11: 0000000000000246 R12: 0000000000000000 R13: 000056169701d000 R14: 0000561697018b40 R15: 0000561697018c60 irq event stamp: 0 hardirqs last enabled at (0): [<0000000000000000>] 0x0 hardirqs last disabled at (0): [] copy_process+0x8a0/0x1d70 softirqs last enabled at (0): [] copy_process+0x8a0/0x1d70 softirqs last disabled at (0): [<0000000000000000>] 0x0 ---[ end trace dd74718fef1ed5c7 ]--- ------------[ cut here ]------------ WARNING: CPU: 2 PID: 1729896 at fs/btrfs/block-group.c:3377 btrfs_free_block_groups+0x25d/0x2f0 [btrfs] Modules linked in: btrfs dm_snapshot dm_thin_pool (...) CPU: 5 PID: 1729896 Comm: umount Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:btrfs_free_block_groups+0x25d/0x2f0 [btrfs] Code: ad de 49 be 22 01 00 (...) RSP: 0018:ffffb270826bbde8 EFLAGS: 00010206 RAX: ffff947ebeae1d08 RBX: ffff947ed73e4000 RCX: 0000000000000000 RDX: 0000000000000001 RSI: ffff947e9d823ae8 RDI: 0000000000000246 RBP: ffff947ebeae1d08 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000001 R12: ffff947ebeae1c00 R13: ffff947ed73e5278 R14: dead000000000122 R15: dead000000000100 FS: 00007f15edfea840(0000) GS:ffff9481ad200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f1475d98ea8 CR3: 0000000138f68005 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: close_ctree+0x2ba/0x2fa [btrfs] generic_shutdown_super+0x6c/0x100 kill_anon_super+0x14/0x30 btrfs_kill_super+0x12/0x20 [btrfs] deactivate_locked_super+0x31/0x70 cleanup_mnt+0x100/0x160 task_work_run+0x68/0xb0 exit_to_user_mode_prepare+0x1bb/0x1c0 syscall_exit_to_user_mode+0x4b/0x260 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f15ee221ee7 Code: ff 0b 00 f7 d8 64 89 (...) RSP: 002b:00007ffe9470f0f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 RAX: 0000000000000000 RBX: 00007f15ee347264 RCX: 00007f15ee221ee7 RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 000056169701d000 RBP: 0000561697018a30 R08: 0000000000000000 R09: 00007f15ee2e2be0 R10: 000056169701efe0 R11: 0000000000000246 R12: 0000000000000000 R13: 000056169701d000 R14: 0000561697018b40 R15: 0000561697018c60 irq event stamp: 0 hardirqs last enabled at (0): [<0000000000000000>] 0x0 hardirqs last disabled at (0): [] copy_process+0x8a0/0x1d70 softirqs last enabled at (0): [] copy_process+0x8a0/0x1d70 softirqs last disabled at (0): [<0000000000000000>] 0x0 ---[ end trace dd74718fef1ed5c8 ]--- BTRFS info (device sdc): space_info 4 has 268238848 free, is not full BTRFS info (device sdc): space_info total=268435456, used=114688, pinned=0, reserved=16384, may_use=0, readonly=65536 BTRFS info (device sdc): global_block_rsv: size 0 reserved 0 BTRFS info (device sdc): trans_block_rsv: size 0 reserved 0 BTRFS info (device sdc): chunk_block_rsv: size 0 reserved 0 BTRFS info (device sdc): delayed_block_rsv: size 0 reserved 0 BTRFS info (device sdc): delayed_refs_rsv: size 524288 reserved 0 And the crash, which only happens when we do not have crc32c hardware acceleration, produces the following trace immediately after those warnings: stack segment: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI CPU: 2 PID: 1749129 Comm: umount Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:btrfs_queue_work+0x36/0x190 [btrfs] Code: 54 55 53 48 89 f3 (...) RSP: 0018:ffffb27082443ae8 EFLAGS: 00010282 RAX: 0000000000000004 RBX: ffff94810ee9ad90 RCX: 0000000000000000 RDX: 0000000000000001 RSI: ffff94810ee9ad90 RDI: ffff947ed8ee75a0 RBP: a56b6b6b6b6b6b6b R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000007 R11: 0000000000000001 R12: ffff947fa9b435a8 R13: ffff94810ee9ad90 R14: 0000000000000000 R15: ffff947e93dc0000 FS: 00007f3cfe974840(0000) GS:ffff9481ac600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f1b42995a70 CR3: 0000000127638003 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: btrfs_wq_submit_bio+0xb3/0xd0 [btrfs] btrfs_submit_metadata_bio+0x44/0xc0 [btrfs] submit_one_bio+0x61/0x70 [btrfs] btree_write_cache_pages+0x414/0x450 [btrfs] ? kobject_put+0x9a/0x1d0 ? trace_hardirqs_on+0x1b/0xf0 ? _raw_spin_unlock_irqrestore+0x3c/0x60 ? free_debug_processing+0x1e1/0x2b0 do_writepages+0x43/0xe0 ? lock_acquired+0x199/0x490 __writeback_single_inode+0x59/0x650 writeback_single_inode+0xaf/0x120 write_inode_now+0x94/0xd0 iput+0x187/0x2b0 close_ctree+0x2c6/0x2fa [btrfs] generic_shutdown_super+0x6c/0x100 kill_anon_super+0x14/0x30 btrfs_kill_super+0x12/0x20 [btrfs] deactivate_locked_super+0x31/0x70 cleanup_mnt+0x100/0x160 task_work_run+0x68/0xb0 exit_to_user_mode_prepare+0x1bb/0x1c0 syscall_exit_to_user_mode+0x4b/0x260 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f3cfebabee7 Code: ff 0b 00 f7 d8 64 89 01 (...) RSP: 002b:00007ffc9c9a05f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 RAX: 0000000000000000 RBX: 00007f3cfecd1264 RCX: 00007f3cfebabee7 RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 0000562b6b478000 RBP: 0000562b6b473a30 R08: 0000000000000000 R09: 00007f3cfec6cbe0 R10: 0000562b6b479fe0 R11: 0000000000000246 R12: 0000000000000000 R13: 0000562b6b478000 R14: 0000562b6b473b40 R15: 0000562b6b473c60 Modules linked in: btrfs dm_snapshot dm_thin_pool (...) ---[ end trace dd74718fef1ed5cc ]--- Finally when we remove the btrfs module (rmmod btrfs), there are several warnings about objects that were allocated from our slabs but were never freed, consequence of the transaction that was never committed and got leaked: ============================================================================= BUG btrfs_delayed_ref_head (Tainted: G B W ): Objects remaining in btrfs_delayed_ref_head on __kmem_cache_shutdown() ----------------------------------------------------------------------------- INFO: Slab 0x0000000094c2ae56 objects=24 used=2 fp=0x000000002bfa2521 flags=0x17fffc000010200 CPU: 5 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack+0x8d/0xb5 slab_err+0xb7/0xdc ? lock_acquired+0x199/0x490 __kmem_cache_shutdown+0x1ac/0x3c0 ? lock_release+0x20e/0x4c0 kmem_cache_destroy+0x55/0x120 btrfs_delayed_ref_exit+0x11/0x35 [btrfs] exit_btrfs_fs+0xa/0x59 [btrfs] __x64_sys_delete_module+0x194/0x260 ? fpregs_assert_state_consistent+0x1e/0x40 ? exit_to_user_mode_prepare+0x55/0x1c0 ? trace_hardirqs_on+0x1b/0xf0 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f693e305897 Code: 73 01 c3 48 8b 0d f9 f5 (...) RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0 RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897 RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8 RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000 R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740 R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760 INFO: Object 0x0000000050cbdd61 @offset=12104 INFO: Allocated in btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs] age=1894 cpu=6 pid=1729873 __slab_alloc.isra.0+0x109/0x1c0 kmem_cache_alloc+0x7bb/0x830 btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs] btrfs_free_tree_block+0x128/0x360 [btrfs] __btrfs_cow_block+0x489/0x5f0 [btrfs] btrfs_cow_block+0xf7/0x220 [btrfs] btrfs_search_slot+0x62a/0xc40 [btrfs] btrfs_del_orphan_item+0x65/0xd0 [btrfs] btrfs_find_orphan_roots+0x1bf/0x200 [btrfs] open_ctree+0x125a/0x18a0 [btrfs] btrfs_mount_root.cold+0x13/0xed [btrfs] legacy_get_tree+0x30/0x60 vfs_get_tree+0x28/0xe0 fc_mount+0xe/0x40 vfs_kern_mount.part.0+0x71/0x90 btrfs_mount+0x13b/0x3e0 [btrfs] INFO: Freed in __btrfs_run_delayed_refs+0x1117/0x1290 [btrfs] age=4292 cpu=2 pid=1729526 kmem_cache_free+0x34c/0x3c0 __btrfs_run_delayed_refs+0x1117/0x1290 [btrfs] btrfs_run_delayed_refs+0x81/0x210 [btrfs] commit_cowonly_roots+0xfb/0x300 [btrfs] btrfs_commit_transaction+0x367/0xc40 [btrfs] sync_filesystem+0x74/0x90 generic_shutdown_super+0x22/0x100 kill_anon_super+0x14/0x30 btrfs_kill_super+0x12/0x20 [btrfs] deactivate_locked_super+0x31/0x70 cleanup_mnt+0x100/0x160 task_work_run+0x68/0xb0 exit_to_user_mode_prepare+0x1bb/0x1c0 syscall_exit_to_user_mode+0x4b/0x260 entry_SYSCALL_64_after_hwframe+0x44/0xa9 INFO: Object 0x0000000086e9b0ff @offset=12776 INFO: Allocated in btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs] age=1900 cpu=6 pid=1729873 __slab_alloc.isra.0+0x109/0x1c0 kmem_cache_alloc+0x7bb/0x830 btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs] btrfs_alloc_tree_block+0x2bf/0x360 [btrfs] alloc_tree_block_no_bg_flush+0x4f/0x60 [btrfs] __btrfs_cow_block+0x12d/0x5f0 [btrfs] btrfs_cow_block+0xf7/0x220 [btrfs] btrfs_search_slot+0x62a/0xc40 [btrfs] btrfs_del_orphan_item+0x65/0xd0 [btrfs] btrfs_find_orphan_roots+0x1bf/0x200 [btrfs] open_ctree+0x125a/0x18a0 [btrfs] btrfs_mount_root.cold+0x13/0xed [btrfs] legacy_get_tree+0x30/0x60 vfs_get_tree+0x28/0xe0 fc_mount+0xe/0x40 vfs_kern_mount.part.0+0x71/0x90 INFO: Freed in __btrfs_run_delayed_refs+0x1117/0x1290 [btrfs] age=3141 cpu=6 pid=1729803 kmem_cache_free+0x34c/0x3c0 __btrfs_run_delayed_refs+0x1117/0x1290 [btrfs] btrfs_run_delayed_refs+0x81/0x210 [btrfs] btrfs_write_dirty_block_groups+0x17d/0x3d0 [btrfs] commit_cowonly_roots+0x248/0x300 [btrfs] btrfs_commit_transaction+0x367/0xc40 [btrfs] close_ctree+0x113/0x2fa [btrfs] generic_shutdown_super+0x6c/0x100 kill_anon_super+0x14/0x30 btrfs_kill_super+0x12/0x20 [btrfs] deactivate_locked_super+0x31/0x70 cleanup_mnt+0x100/0x160 task_work_run+0x68/0xb0 exit_to_user_mode_prepare+0x1bb/0x1c0 syscall_exit_to_user_mode+0x4b/0x260 entry_SYSCALL_64_after_hwframe+0x44/0xa9 kmem_cache_destroy btrfs_delayed_ref_head: Slab cache still has objects CPU: 5 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack+0x8d/0xb5 kmem_cache_destroy+0x119/0x120 btrfs_delayed_ref_exit+0x11/0x35 [btrfs] exit_btrfs_fs+0xa/0x59 [btrfs] __x64_sys_delete_module+0x194/0x260 ? fpregs_assert_state_consistent+0x1e/0x40 ? exit_to_user_mode_prepare+0x55/0x1c0 ? trace_hardirqs_on+0x1b/0xf0 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f693e305897 Code: 73 01 c3 48 8b 0d f9 f5 0b (...) RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0 RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897 RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8 RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000 R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740 R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760 ============================================================================= BUG btrfs_delayed_tree_ref (Tainted: G B W ): Objects remaining in btrfs_delayed_tree_ref on __kmem_cache_shutdown() ----------------------------------------------------------------------------- INFO: Slab 0x0000000011f78dc0 objects=37 used=2 fp=0x0000000032d55d91 flags=0x17fffc000010200 CPU: 3 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack+0x8d/0xb5 slab_err+0xb7/0xdc ? lock_acquired+0x199/0x490 __kmem_cache_shutdown+0x1ac/0x3c0 ? lock_release+0x20e/0x4c0 kmem_cache_destroy+0x55/0x120 btrfs_delayed_ref_exit+0x1d/0x35 [btrfs] exit_btrfs_fs+0xa/0x59 [btrfs] __x64_sys_delete_module+0x194/0x260 ? fpregs_assert_state_consistent+0x1e/0x40 ? exit_to_user_mode_prepare+0x55/0x1c0 ? trace_hardirqs_on+0x1b/0xf0 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f693e305897 Code: 73 01 c3 48 8b 0d f9 f5 (...) RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0 RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897 RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8 RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000 R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740 R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760 INFO: Object 0x000000001a340018 @offset=4408 INFO: Allocated in btrfs_add_delayed_tree_ref+0x9e/0x480 [btrfs] age=1917 cpu=6 pid=1729873 __slab_alloc.isra.0+0x109/0x1c0 kmem_cache_alloc+0x7bb/0x830 btrfs_add_delayed_tree_ref+0x9e/0x480 [btrfs] btrfs_free_tree_block+0x128/0x360 [btrfs] __btrfs_cow_block+0x489/0x5f0 [btrfs] btrfs_cow_block+0xf7/0x220 [btrfs] btrfs_search_slot+0x62a/0xc40 [btrfs] btrfs_del_orphan_item+0x65/0xd0 [btrfs] btrfs_find_orphan_roots+0x1bf/0x200 [btrfs] open_ctree+0x125a/0x18a0 [btrfs] btrfs_mount_root.cold+0x13/0xed [btrfs] legacy_get_tree+0x30/0x60 vfs_get_tree+0x28/0xe0 fc_mount+0xe/0x40 vfs_kern_mount.part.0+0x71/0x90 btrfs_mount+0x13b/0x3e0 [btrfs] INFO: Freed in __btrfs_run_delayed_refs+0x63d/0x1290 [btrfs] age=4167 cpu=4 pid=1729795 kmem_cache_free+0x34c/0x3c0 __btrfs_run_delayed_refs+0x63d/0x1290 [btrfs] btrfs_run_delayed_refs+0x81/0x210 [btrfs] btrfs_commit_transaction+0x60/0xc40 [btrfs] create_subvol+0x56a/0x990 [btrfs] btrfs_mksubvol+0x3fb/0x4a0 [btrfs] __btrfs_ioctl_snap_create+0x119/0x1a0 [btrfs] btrfs_ioctl_snap_create+0x58/0x80 [btrfs] btrfs_ioctl+0x1a92/0x36f0 [btrfs] __x64_sys_ioctl+0x83/0xb0 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 INFO: Object 0x000000002b46292a @offset=13648 INFO: Allocated in btrfs_add_delayed_tree_ref+0x9e/0x480 [btrfs] age=1923 cpu=6 pid=1729873 __slab_alloc.isra.0+0x109/0x1c0 kmem_cache_alloc+0x7bb/0x830 btrfs_add_delayed_tree_ref+0x9e/0x480 [btrfs] btrfs_alloc_tree_block+0x2bf/0x360 [btrfs] alloc_tree_block_no_bg_flush+0x4f/0x60 [btrfs] __btrfs_cow_block+0x12d/0x5f0 [btrfs] btrfs_cow_block+0xf7/0x220 [btrfs] btrfs_search_slot+0x62a/0xc40 [btrfs] btrfs_del_orphan_item+0x65/0xd0 [btrfs] btrfs_find_orphan_roots+0x1bf/0x200 [btrfs] open_ctree+0x125a/0x18a0 [btrfs] btrfs_mount_root.cold+0x13/0xed [btrfs] legacy_get_tree+0x30/0x60 vfs_get_tree+0x28/0xe0 fc_mount+0xe/0x40 vfs_kern_mount.part.0+0x71/0x90 INFO: Freed in __btrfs_run_delayed_refs+0x63d/0x1290 [btrfs] age=3164 cpu=6 pid=1729803 kmem_cache_free+0x34c/0x3c0 __btrfs_run_delayed_refs+0x63d/0x1290 [btrfs] btrfs_run_delayed_refs+0x81/0x210 [btrfs] commit_cowonly_roots+0xfb/0x300 [btrfs] btrfs_commit_transaction+0x367/0xc40 [btrfs] close_ctree+0x113/0x2fa [btrfs] generic_shutdown_super+0x6c/0x100 kill_anon_super+0x14/0x30 btrfs_kill_super+0x12/0x20 [btrfs] deactivate_locked_super+0x31/0x70 cleanup_mnt+0x100/0x160 task_work_run+0x68/0xb0 exit_to_user_mode_prepare+0x1bb/0x1c0 syscall_exit_to_user_mode+0x4b/0x260 entry_SYSCALL_64_after_hwframe+0x44/0xa9 kmem_cache_destroy btrfs_delayed_tree_ref: Slab cache still has objects CPU: 5 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack+0x8d/0xb5 kmem_cache_destroy+0x119/0x120 btrfs_delayed_ref_exit+0x1d/0x35 [btrfs] exit_btrfs_fs+0xa/0x59 [btrfs] __x64_sys_delete_module+0x194/0x260 ? fpregs_assert_state_consistent+0x1e/0x40 ? exit_to_user_mode_prepare+0x55/0x1c0 ? trace_hardirqs_on+0x1b/0xf0 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f693e305897 Code: 73 01 c3 48 8b 0d f9 f5 (...) RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0 RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897 RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8 RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000 R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740 R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760 ============================================================================= BUG btrfs_delayed_extent_op (Tainted: G B W ): Objects remaining in btrfs_delayed_extent_op on __kmem_cache_shutdown() ----------------------------------------------------------------------------- INFO: Slab 0x00000000f145ce2f objects=22 used=1 fp=0x00000000af0f92cf flags=0x17fffc000010200 CPU: 5 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack+0x8d/0xb5 slab_err+0xb7/0xdc ? lock_acquired+0x199/0x490 __kmem_cache_shutdown+0x1ac/0x3c0 ? __mutex_unlock_slowpath+0x45/0x2a0 kmem_cache_destroy+0x55/0x120 exit_btrfs_fs+0xa/0x59 [btrfs] __x64_sys_delete_module+0x194/0x260 ? fpregs_assert_state_consistent+0x1e/0x40 ? exit_to_user_mode_prepare+0x55/0x1c0 ? trace_hardirqs_on+0x1b/0xf0 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f693e305897 Code: 73 01 c3 48 8b 0d f9 f5 (...) RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0 RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897 RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8 RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000 R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740 R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760 INFO: Object 0x000000004cf95ea8 @offset=6264 INFO: Allocated in btrfs_alloc_tree_block+0x1e0/0x360 [btrfs] age=1931 cpu=6 pid=1729873 __slab_alloc.isra.0+0x109/0x1c0 kmem_cache_alloc+0x7bb/0x830 btrfs_alloc_tree_block+0x1e0/0x360 [btrfs] alloc_tree_block_no_bg_flush+0x4f/0x60 [btrfs] __btrfs_cow_block+0x12d/0x5f0 [btrfs] btrfs_cow_block+0xf7/0x220 [btrfs] btrfs_search_slot+0x62a/0xc40 [btrfs] btrfs_del_orphan_item+0x65/0xd0 [btrfs] btrfs_find_orphan_roots+0x1bf/0x200 [btrfs] open_ctree+0x125a/0x18a0 [btrfs] btrfs_mount_root.cold+0x13/0xed [btrfs] legacy_get_tree+0x30/0x60 vfs_get_tree+0x28/0xe0 fc_mount+0xe/0x40 vfs_kern_mount.part.0+0x71/0x90 btrfs_mount+0x13b/0x3e0 [btrfs] INFO: Freed in __btrfs_run_delayed_refs+0xabd/0x1290 [btrfs] age=3173 cpu=6 pid=1729803 kmem_cache_free+0x34c/0x3c0 __btrfs_run_delayed_refs+0xabd/0x1290 [btrfs] btrfs_run_delayed_refs+0x81/0x210 [btrfs] commit_cowonly_roots+0xfb/0x300 [btrfs] btrfs_commit_transaction+0x367/0xc40 [btrfs] close_ctree+0x113/0x2fa [btrfs] generic_shutdown_super+0x6c/0x100 kill_anon_super+0x14/0x30 btrfs_kill_super+0x12/0x20 [btrfs] deactivate_locked_super+0x31/0x70 cleanup_mnt+0x100/0x160 task_work_run+0x68/0xb0 exit_to_user_mode_prepare+0x1bb/0x1c0 syscall_exit_to_user_mode+0x4b/0x260 entry_SYSCALL_64_after_hwframe+0x44/0xa9 kmem_cache_destroy btrfs_delayed_extent_op: Slab cache still has objects CPU: 3 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack+0x8d/0xb5 kmem_cache_destroy+0x119/0x120 exit_btrfs_fs+0xa/0x59 [btrfs] __x64_sys_delete_module+0x194/0x260 ? fpregs_assert_state_consistent+0x1e/0x40 ? exit_to_user_mode_prepare+0x55/0x1c0 ? trace_hardirqs_on+0x1b/0xf0 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f693e305897 Code: 73 01 c3 48 8b 0d f9 (...) RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0 RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897 RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8 RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000 R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740 R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760 BTRFS: state leak: start 30408704 end 30425087 state 1 in tree 1 refs 1 Fix this issue by having the remount path stop the qgroup rescan worker when we are remounting RO and teach the rescan worker to stop when a remount is in progress. If later a remount in RW mode happens, we are already resuming the qgroup rescan worker through the call to btrfs_qgroup_rescan_resume(), so we do not need to worry about that. Tested-by: Fabian Vogt Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 13 ++++++++++--- fs/btrfs/super.c | 8 ++++++++ 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 47f27658eac1..808370ada888 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3190,6 +3190,12 @@ out: return ret; } +static bool rescan_should_stop(struct btrfs_fs_info *fs_info) +{ + return btrfs_fs_closing(fs_info) || + test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); +} + static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) { struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info, @@ -3198,6 +3204,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) struct btrfs_trans_handle *trans = NULL; int err = -ENOMEM; int ret = 0; + bool stopped = false; path = btrfs_alloc_path(); if (!path) @@ -3210,7 +3217,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) path->skip_locking = 1; err = 0; - while (!err && !btrfs_fs_closing(fs_info)) { + while (!err && !(stopped = rescan_should_stop(fs_info))) { trans = btrfs_start_transaction(fs_info->fs_root, 0); if (IS_ERR(trans)) { err = PTR_ERR(trans); @@ -3253,7 +3260,7 @@ out: } mutex_lock(&fs_info->qgroup_rescan_lock); - if (!btrfs_fs_closing(fs_info)) + if (!stopped) fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; if (trans) { ret = update_qgroup_status_item(trans); @@ -3272,7 +3279,7 @@ out: btrfs_end_transaction(trans); - if (btrfs_fs_closing(fs_info)) { + if (stopped) { btrfs_info(fs_info, "qgroup scan paused"); } else if (err >= 0) { btrfs_info(fs_info, "qgroup scan completed%s", diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 022f20810089..b24fa62375e0 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1968,6 +1968,14 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) btrfs_scrub_cancel(fs_info); btrfs_pause_balance(fs_info); + /* + * Pause the qgroup rescan worker if it is running. We don't want + * it to be still running after we are in RO mode, as after that, + * by the time we unmount, it might have left a transaction open, + * so we would leak the transaction and/or crash. + */ + btrfs_qgroup_wait_for_completion(fs_info, false); + ret = btrfs_commit_super(fs_info); if (ret) goto restore; From 638331fa56caeaa8b4d31cc1dfbe0ce989bcff67 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 14 Dec 2020 10:10:46 +0000 Subject: [PATCH 070/547] btrfs: fix transaction leak and crash after cleaning up orphans on RO mount When we delete a root (subvolume or snapshot), at the very end of the operation, we attempt to remove the root's orphan item from the root tree, at btrfs_drop_snapshot(), by calling btrfs_del_orphan_item(). We ignore any error from btrfs_del_orphan_item() since it is not a serious problem and the next time the filesystem is mounted we remove such stray orphan items at btrfs_find_orphan_roots(). However if the filesystem is mounted RO and we have stray orphan items for any previously deleted root, we can end up leaking a transaction and other data structures when unmounting the filesystem, as well as crashing if we do not have hardware acceleration for crc32c available. The steps that lead to the transaction leak are the following: 1) The filesystem is mounted in RW mode; 2) A subvolume is deleted; 3) When the cleaner kthread runs btrfs_drop_snapshot() to delete the root, it gets a failure at btrfs_del_orphan_item(), which is ignored, due to an ENOMEM when allocating a path for example. So the orphan item for the root remains in the root tree; 4) The filesystem is unmounted; 5) The filesystem is mounted RO (-o ro). During the mount path we call btrfs_find_orphan_roots(), which iterates the root tree searching for orphan items. It finds the orphan item for our deleted root, and since it can not find the root, it starts a transaction to delete the orphan item (by calling btrfs_del_orphan_item()); 6) The RO mount completes; 7) Before the transaction kthread commits the transaction created for deleting the orphan item (i.e. less than 30 seconds elapsed since the mount, the default commit interval), a filesystem unmount operation is started; 8) At close_ctree(), we stop the transaction kthread, but we still have a transaction open with at least one dirty extent buffer, a leaf for the tree root which was COWed when deleting the orphan item; 9) We then proceed to destroy the work queues, free the roots and block groups, etc. After that we drop the last reference on the btree inode by calling iput() on it. Since there are dirty pages for the btree inode, corresponding to the COWed extent buffer, btree_write_cache_pages() is invoked to flush those dirty pages. This results in creating a bio and submitting it, which makes us end up at btrfs_submit_metadata_bio(); 10) At btrfs_submit_metadata_bio() we end up at the if-then-else branch that calls btrfs_wq_submit_bio(), because check_async_write() returned a value of 1. This value of 1 is because we did not have hardware acceleration available for crc32c, so BTRFS_FS_CSUM_IMPL_FAST was not set in fs_info->flags; 11) Then at btrfs_wq_submit_bio() we call btrfs_queue_work() against the workqueue at fs_info->workers, which was already freed before by the call to btrfs_stop_all_workers() at close_ctree(). This results in an invalid memory access due to a use-after-free, leading to a crash. When this happens, before the crash there are several warnings triggered, since we have reserved metadata space in a block group, the delayed refs reservation, etc: ------------[ cut here ]------------ WARNING: CPU: 4 PID: 1729896 at fs/btrfs/block-group.c:125 btrfs_put_block_group+0x63/0xa0 [btrfs] Modules linked in: btrfs dm_snapshot dm_thin_pool (...) CPU: 4 PID: 1729896 Comm: umount Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:btrfs_put_block_group+0x63/0xa0 [btrfs] Code: f0 01 00 00 48 39 c2 75 (...) RSP: 0018:ffffb270826bbdd8 EFLAGS: 00010206 RAX: 0000000000000001 RBX: ffff947ed73e4000 RCX: ffff947ebc8b29c8 RDX: 0000000000000001 RSI: ffffffffc0b150a0 RDI: ffff947ebc8b2800 RBP: ffff947ebc8b2800 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000001 R12: ffff947ed73e4110 R13: ffff947ed73e4160 R14: ffff947ebc8b2988 R15: dead000000000100 FS: 00007f15edfea840(0000) GS:ffff9481ad600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f37e2893320 CR3: 0000000138f68001 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: btrfs_free_block_groups+0x17f/0x2f0 [btrfs] close_ctree+0x2ba/0x2fa [btrfs] generic_shutdown_super+0x6c/0x100 kill_anon_super+0x14/0x30 btrfs_kill_super+0x12/0x20 [btrfs] deactivate_locked_super+0x31/0x70 cleanup_mnt+0x100/0x160 task_work_run+0x68/0xb0 exit_to_user_mode_prepare+0x1bb/0x1c0 syscall_exit_to_user_mode+0x4b/0x260 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f15ee221ee7 Code: ff 0b 00 f7 d8 64 89 01 48 (...) RSP: 002b:00007ffe9470f0f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 RAX: 0000000000000000 RBX: 00007f15ee347264 RCX: 00007f15ee221ee7 RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 000056169701d000 RBP: 0000561697018a30 R08: 0000000000000000 R09: 00007f15ee2e2be0 R10: 000056169701efe0 R11: 0000000000000246 R12: 0000000000000000 R13: 000056169701d000 R14: 0000561697018b40 R15: 0000561697018c60 irq event stamp: 0 hardirqs last enabled at (0): [<0000000000000000>] 0x0 hardirqs last disabled at (0): [] copy_process+0x8a0/0x1d70 softirqs last enabled at (0): [] copy_process+0x8a0/0x1d70 softirqs last disabled at (0): [<0000000000000000>] 0x0 ---[ end trace dd74718fef1ed5c6 ]--- ------------[ cut here ]------------ WARNING: CPU: 2 PID: 1729896 at fs/btrfs/block-rsv.c:459 btrfs_release_global_block_rsv+0x70/0xc0 [btrfs] Modules linked in: btrfs dm_snapshot dm_thin_pool (...) CPU: 2 PID: 1729896 Comm: umount Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:btrfs_release_global_block_rsv+0x70/0xc0 [btrfs] Code: 48 83 bb b0 03 00 00 00 (...) RSP: 0018:ffffb270826bbdd8 EFLAGS: 00010206 RAX: 000000000033c000 RBX: ffff947ed73e4000 RCX: 0000000000000000 RDX: 0000000000000001 RSI: ffffffffc0b0d8c1 RDI: 00000000ffffffff RBP: ffff947ebc8b7000 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000001 R12: ffff947ed73e4110 R13: ffff947ed73e5278 R14: dead000000000122 R15: dead000000000100 FS: 00007f15edfea840(0000) GS:ffff9481aca00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000561a79f76e20 CR3: 0000000138f68006 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: btrfs_free_block_groups+0x24c/0x2f0 [btrfs] close_ctree+0x2ba/0x2fa [btrfs] generic_shutdown_super+0x6c/0x100 kill_anon_super+0x14/0x30 btrfs_kill_super+0x12/0x20 [btrfs] deactivate_locked_super+0x31/0x70 cleanup_mnt+0x100/0x160 task_work_run+0x68/0xb0 exit_to_user_mode_prepare+0x1bb/0x1c0 syscall_exit_to_user_mode+0x4b/0x260 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f15ee221ee7 Code: ff 0b 00 f7 d8 64 89 01 (...) RSP: 002b:00007ffe9470f0f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 RAX: 0000000000000000 RBX: 00007f15ee347264 RCX: 00007f15ee221ee7 RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 000056169701d000 RBP: 0000561697018a30 R08: 0000000000000000 R09: 00007f15ee2e2be0 R10: 000056169701efe0 R11: 0000000000000246 R12: 0000000000000000 R13: 000056169701d000 R14: 0000561697018b40 R15: 0000561697018c60 irq event stamp: 0 hardirqs last enabled at (0): [<0000000000000000>] 0x0 hardirqs last disabled at (0): [] copy_process+0x8a0/0x1d70 softirqs last enabled at (0): [] copy_process+0x8a0/0x1d70 softirqs last disabled at (0): [<0000000000000000>] 0x0 ---[ end trace dd74718fef1ed5c7 ]--- ------------[ cut here ]------------ WARNING: CPU: 2 PID: 1729896 at fs/btrfs/block-group.c:3377 btrfs_free_block_groups+0x25d/0x2f0 [btrfs] Modules linked in: btrfs dm_snapshot dm_thin_pool (...) CPU: 5 PID: 1729896 Comm: umount Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:btrfs_free_block_groups+0x25d/0x2f0 [btrfs] Code: ad de 49 be 22 01 00 (...) RSP: 0018:ffffb270826bbde8 EFLAGS: 00010206 RAX: ffff947ebeae1d08 RBX: ffff947ed73e4000 RCX: 0000000000000000 RDX: 0000000000000001 RSI: ffff947e9d823ae8 RDI: 0000000000000246 RBP: ffff947ebeae1d08 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000001 R12: ffff947ebeae1c00 R13: ffff947ed73e5278 R14: dead000000000122 R15: dead000000000100 FS: 00007f15edfea840(0000) GS:ffff9481ad200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f1475d98ea8 CR3: 0000000138f68005 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: close_ctree+0x2ba/0x2fa [btrfs] generic_shutdown_super+0x6c/0x100 kill_anon_super+0x14/0x30 btrfs_kill_super+0x12/0x20 [btrfs] deactivate_locked_super+0x31/0x70 cleanup_mnt+0x100/0x160 task_work_run+0x68/0xb0 exit_to_user_mode_prepare+0x1bb/0x1c0 syscall_exit_to_user_mode+0x4b/0x260 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f15ee221ee7 Code: ff 0b 00 f7 d8 64 89 (...) RSP: 002b:00007ffe9470f0f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 RAX: 0000000000000000 RBX: 00007f15ee347264 RCX: 00007f15ee221ee7 RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 000056169701d000 RBP: 0000561697018a30 R08: 0000000000000000 R09: 00007f15ee2e2be0 R10: 000056169701efe0 R11: 0000000000000246 R12: 0000000000000000 R13: 000056169701d000 R14: 0000561697018b40 R15: 0000561697018c60 irq event stamp: 0 hardirqs last enabled at (0): [<0000000000000000>] 0x0 hardirqs last disabled at (0): [] copy_process+0x8a0/0x1d70 softirqs last enabled at (0): [] copy_process+0x8a0/0x1d70 softirqs last disabled at (0): [<0000000000000000>] 0x0 ---[ end trace dd74718fef1ed5c8 ]--- BTRFS info (device sdc): space_info 4 has 268238848 free, is not full BTRFS info (device sdc): space_info total=268435456, used=114688, pinned=0, reserved=16384, may_use=0, readonly=65536 BTRFS info (device sdc): global_block_rsv: size 0 reserved 0 BTRFS info (device sdc): trans_block_rsv: size 0 reserved 0 BTRFS info (device sdc): chunk_block_rsv: size 0 reserved 0 BTRFS info (device sdc): delayed_block_rsv: size 0 reserved 0 BTRFS info (device sdc): delayed_refs_rsv: size 524288 reserved 0 And the crash, which only happens when we do not have crc32c hardware acceleration, produces the following trace immediately after those warnings: stack segment: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI CPU: 2 PID: 1749129 Comm: umount Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:btrfs_queue_work+0x36/0x190 [btrfs] Code: 54 55 53 48 89 f3 (...) RSP: 0018:ffffb27082443ae8 EFLAGS: 00010282 RAX: 0000000000000004 RBX: ffff94810ee9ad90 RCX: 0000000000000000 RDX: 0000000000000001 RSI: ffff94810ee9ad90 RDI: ffff947ed8ee75a0 RBP: a56b6b6b6b6b6b6b R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000007 R11: 0000000000000001 R12: ffff947fa9b435a8 R13: ffff94810ee9ad90 R14: 0000000000000000 R15: ffff947e93dc0000 FS: 00007f3cfe974840(0000) GS:ffff9481ac600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f1b42995a70 CR3: 0000000127638003 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: btrfs_wq_submit_bio+0xb3/0xd0 [btrfs] btrfs_submit_metadata_bio+0x44/0xc0 [btrfs] submit_one_bio+0x61/0x70 [btrfs] btree_write_cache_pages+0x414/0x450 [btrfs] ? kobject_put+0x9a/0x1d0 ? trace_hardirqs_on+0x1b/0xf0 ? _raw_spin_unlock_irqrestore+0x3c/0x60 ? free_debug_processing+0x1e1/0x2b0 do_writepages+0x43/0xe0 ? lock_acquired+0x199/0x490 __writeback_single_inode+0x59/0x650 writeback_single_inode+0xaf/0x120 write_inode_now+0x94/0xd0 iput+0x187/0x2b0 close_ctree+0x2c6/0x2fa [btrfs] generic_shutdown_super+0x6c/0x100 kill_anon_super+0x14/0x30 btrfs_kill_super+0x12/0x20 [btrfs] deactivate_locked_super+0x31/0x70 cleanup_mnt+0x100/0x160 task_work_run+0x68/0xb0 exit_to_user_mode_prepare+0x1bb/0x1c0 syscall_exit_to_user_mode+0x4b/0x260 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f3cfebabee7 Code: ff 0b 00 f7 d8 64 89 01 (...) RSP: 002b:00007ffc9c9a05f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 RAX: 0000000000000000 RBX: 00007f3cfecd1264 RCX: 00007f3cfebabee7 RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 0000562b6b478000 RBP: 0000562b6b473a30 R08: 0000000000000000 R09: 00007f3cfec6cbe0 R10: 0000562b6b479fe0 R11: 0000000000000246 R12: 0000000000000000 R13: 0000562b6b478000 R14: 0000562b6b473b40 R15: 0000562b6b473c60 Modules linked in: btrfs dm_snapshot dm_thin_pool (...) ---[ end trace dd74718fef1ed5cc ]--- Finally when we remove the btrfs module (rmmod btrfs), there are several warnings about objects that were allocated from our slabs but were never freed, consequence of the transaction that was never committed and got leaked: ============================================================================= BUG btrfs_delayed_ref_head (Tainted: G B W ): Objects remaining in btrfs_delayed_ref_head on __kmem_cache_shutdown() ----------------------------------------------------------------------------- INFO: Slab 0x0000000094c2ae56 objects=24 used=2 fp=0x000000002bfa2521 flags=0x17fffc000010200 CPU: 5 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack+0x8d/0xb5 slab_err+0xb7/0xdc ? lock_acquired+0x199/0x490 __kmem_cache_shutdown+0x1ac/0x3c0 ? lock_release+0x20e/0x4c0 kmem_cache_destroy+0x55/0x120 btrfs_delayed_ref_exit+0x11/0x35 [btrfs] exit_btrfs_fs+0xa/0x59 [btrfs] __x64_sys_delete_module+0x194/0x260 ? fpregs_assert_state_consistent+0x1e/0x40 ? exit_to_user_mode_prepare+0x55/0x1c0 ? trace_hardirqs_on+0x1b/0xf0 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f693e305897 Code: 73 01 c3 48 8b 0d f9 f5 (...) RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0 RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897 RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8 RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000 R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740 R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760 INFO: Object 0x0000000050cbdd61 @offset=12104 INFO: Allocated in btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs] age=1894 cpu=6 pid=1729873 __slab_alloc.isra.0+0x109/0x1c0 kmem_cache_alloc+0x7bb/0x830 btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs] btrfs_free_tree_block+0x128/0x360 [btrfs] __btrfs_cow_block+0x489/0x5f0 [btrfs] btrfs_cow_block+0xf7/0x220 [btrfs] btrfs_search_slot+0x62a/0xc40 [btrfs] btrfs_del_orphan_item+0x65/0xd0 [btrfs] btrfs_find_orphan_roots+0x1bf/0x200 [btrfs] open_ctree+0x125a/0x18a0 [btrfs] btrfs_mount_root.cold+0x13/0xed [btrfs] legacy_get_tree+0x30/0x60 vfs_get_tree+0x28/0xe0 fc_mount+0xe/0x40 vfs_kern_mount.part.0+0x71/0x90 btrfs_mount+0x13b/0x3e0 [btrfs] INFO: Freed in __btrfs_run_delayed_refs+0x1117/0x1290 [btrfs] age=4292 cpu=2 pid=1729526 kmem_cache_free+0x34c/0x3c0 __btrfs_run_delayed_refs+0x1117/0x1290 [btrfs] btrfs_run_delayed_refs+0x81/0x210 [btrfs] commit_cowonly_roots+0xfb/0x300 [btrfs] btrfs_commit_transaction+0x367/0xc40 [btrfs] sync_filesystem+0x74/0x90 generic_shutdown_super+0x22/0x100 kill_anon_super+0x14/0x30 btrfs_kill_super+0x12/0x20 [btrfs] deactivate_locked_super+0x31/0x70 cleanup_mnt+0x100/0x160 task_work_run+0x68/0xb0 exit_to_user_mode_prepare+0x1bb/0x1c0 syscall_exit_to_user_mode+0x4b/0x260 entry_SYSCALL_64_after_hwframe+0x44/0xa9 INFO: Object 0x0000000086e9b0ff @offset=12776 INFO: Allocated in btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs] age=1900 cpu=6 pid=1729873 __slab_alloc.isra.0+0x109/0x1c0 kmem_cache_alloc+0x7bb/0x830 btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs] btrfs_alloc_tree_block+0x2bf/0x360 [btrfs] alloc_tree_block_no_bg_flush+0x4f/0x60 [btrfs] __btrfs_cow_block+0x12d/0x5f0 [btrfs] btrfs_cow_block+0xf7/0x220 [btrfs] btrfs_search_slot+0x62a/0xc40 [btrfs] btrfs_del_orphan_item+0x65/0xd0 [btrfs] btrfs_find_orphan_roots+0x1bf/0x200 [btrfs] open_ctree+0x125a/0x18a0 [btrfs] btrfs_mount_root.cold+0x13/0xed [btrfs] legacy_get_tree+0x30/0x60 vfs_get_tree+0x28/0xe0 fc_mount+0xe/0x40 vfs_kern_mount.part.0+0x71/0x90 INFO: Freed in __btrfs_run_delayed_refs+0x1117/0x1290 [btrfs] age=3141 cpu=6 pid=1729803 kmem_cache_free+0x34c/0x3c0 __btrfs_run_delayed_refs+0x1117/0x1290 [btrfs] btrfs_run_delayed_refs+0x81/0x210 [btrfs] btrfs_write_dirty_block_groups+0x17d/0x3d0 [btrfs] commit_cowonly_roots+0x248/0x300 [btrfs] btrfs_commit_transaction+0x367/0xc40 [btrfs] close_ctree+0x113/0x2fa [btrfs] generic_shutdown_super+0x6c/0x100 kill_anon_super+0x14/0x30 btrfs_kill_super+0x12/0x20 [btrfs] deactivate_locked_super+0x31/0x70 cleanup_mnt+0x100/0x160 task_work_run+0x68/0xb0 exit_to_user_mode_prepare+0x1bb/0x1c0 syscall_exit_to_user_mode+0x4b/0x260 entry_SYSCALL_64_after_hwframe+0x44/0xa9 kmem_cache_destroy btrfs_delayed_ref_head: Slab cache still has objects CPU: 5 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack+0x8d/0xb5 kmem_cache_destroy+0x119/0x120 btrfs_delayed_ref_exit+0x11/0x35 [btrfs] exit_btrfs_fs+0xa/0x59 [btrfs] __x64_sys_delete_module+0x194/0x260 ? fpregs_assert_state_consistent+0x1e/0x40 ? exit_to_user_mode_prepare+0x55/0x1c0 ? trace_hardirqs_on+0x1b/0xf0 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f693e305897 Code: 73 01 c3 48 8b 0d f9 f5 0b (...) RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0 RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897 RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8 RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000 R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740 R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760 ============================================================================= BUG btrfs_delayed_tree_ref (Tainted: G B W ): Objects remaining in btrfs_delayed_tree_ref on __kmem_cache_shutdown() ----------------------------------------------------------------------------- INFO: Slab 0x0000000011f78dc0 objects=37 used=2 fp=0x0000000032d55d91 flags=0x17fffc000010200 CPU: 3 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack+0x8d/0xb5 slab_err+0xb7/0xdc ? lock_acquired+0x199/0x490 __kmem_cache_shutdown+0x1ac/0x3c0 ? lock_release+0x20e/0x4c0 kmem_cache_destroy+0x55/0x120 btrfs_delayed_ref_exit+0x1d/0x35 [btrfs] exit_btrfs_fs+0xa/0x59 [btrfs] __x64_sys_delete_module+0x194/0x260 ? fpregs_assert_state_consistent+0x1e/0x40 ? exit_to_user_mode_prepare+0x55/0x1c0 ? trace_hardirqs_on+0x1b/0xf0 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f693e305897 Code: 73 01 c3 48 8b 0d f9 f5 (...) RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0 RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897 RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8 RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000 R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740 R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760 INFO: Object 0x000000001a340018 @offset=4408 INFO: Allocated in btrfs_add_delayed_tree_ref+0x9e/0x480 [btrfs] age=1917 cpu=6 pid=1729873 __slab_alloc.isra.0+0x109/0x1c0 kmem_cache_alloc+0x7bb/0x830 btrfs_add_delayed_tree_ref+0x9e/0x480 [btrfs] btrfs_free_tree_block+0x128/0x360 [btrfs] __btrfs_cow_block+0x489/0x5f0 [btrfs] btrfs_cow_block+0xf7/0x220 [btrfs] btrfs_search_slot+0x62a/0xc40 [btrfs] btrfs_del_orphan_item+0x65/0xd0 [btrfs] btrfs_find_orphan_roots+0x1bf/0x200 [btrfs] open_ctree+0x125a/0x18a0 [btrfs] btrfs_mount_root.cold+0x13/0xed [btrfs] legacy_get_tree+0x30/0x60 vfs_get_tree+0x28/0xe0 fc_mount+0xe/0x40 vfs_kern_mount.part.0+0x71/0x90 btrfs_mount+0x13b/0x3e0 [btrfs] INFO: Freed in __btrfs_run_delayed_refs+0x63d/0x1290 [btrfs] age=4167 cpu=4 pid=1729795 kmem_cache_free+0x34c/0x3c0 __btrfs_run_delayed_refs+0x63d/0x1290 [btrfs] btrfs_run_delayed_refs+0x81/0x210 [btrfs] btrfs_commit_transaction+0x60/0xc40 [btrfs] create_subvol+0x56a/0x990 [btrfs] btrfs_mksubvol+0x3fb/0x4a0 [btrfs] __btrfs_ioctl_snap_create+0x119/0x1a0 [btrfs] btrfs_ioctl_snap_create+0x58/0x80 [btrfs] btrfs_ioctl+0x1a92/0x36f0 [btrfs] __x64_sys_ioctl+0x83/0xb0 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 INFO: Object 0x000000002b46292a @offset=13648 INFO: Allocated in btrfs_add_delayed_tree_ref+0x9e/0x480 [btrfs] age=1923 cpu=6 pid=1729873 __slab_alloc.isra.0+0x109/0x1c0 kmem_cache_alloc+0x7bb/0x830 btrfs_add_delayed_tree_ref+0x9e/0x480 [btrfs] btrfs_alloc_tree_block+0x2bf/0x360 [btrfs] alloc_tree_block_no_bg_flush+0x4f/0x60 [btrfs] __btrfs_cow_block+0x12d/0x5f0 [btrfs] btrfs_cow_block+0xf7/0x220 [btrfs] btrfs_search_slot+0x62a/0xc40 [btrfs] btrfs_del_orphan_item+0x65/0xd0 [btrfs] btrfs_find_orphan_roots+0x1bf/0x200 [btrfs] open_ctree+0x125a/0x18a0 [btrfs] btrfs_mount_root.cold+0x13/0xed [btrfs] legacy_get_tree+0x30/0x60 vfs_get_tree+0x28/0xe0 fc_mount+0xe/0x40 vfs_kern_mount.part.0+0x71/0x90 INFO: Freed in __btrfs_run_delayed_refs+0x63d/0x1290 [btrfs] age=3164 cpu=6 pid=1729803 kmem_cache_free+0x34c/0x3c0 __btrfs_run_delayed_refs+0x63d/0x1290 [btrfs] btrfs_run_delayed_refs+0x81/0x210 [btrfs] commit_cowonly_roots+0xfb/0x300 [btrfs] btrfs_commit_transaction+0x367/0xc40 [btrfs] close_ctree+0x113/0x2fa [btrfs] generic_shutdown_super+0x6c/0x100 kill_anon_super+0x14/0x30 btrfs_kill_super+0x12/0x20 [btrfs] deactivate_locked_super+0x31/0x70 cleanup_mnt+0x100/0x160 task_work_run+0x68/0xb0 exit_to_user_mode_prepare+0x1bb/0x1c0 syscall_exit_to_user_mode+0x4b/0x260 entry_SYSCALL_64_after_hwframe+0x44/0xa9 kmem_cache_destroy btrfs_delayed_tree_ref: Slab cache still has objects CPU: 5 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack+0x8d/0xb5 kmem_cache_destroy+0x119/0x120 btrfs_delayed_ref_exit+0x1d/0x35 [btrfs] exit_btrfs_fs+0xa/0x59 [btrfs] __x64_sys_delete_module+0x194/0x260 ? fpregs_assert_state_consistent+0x1e/0x40 ? exit_to_user_mode_prepare+0x55/0x1c0 ? trace_hardirqs_on+0x1b/0xf0 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f693e305897 Code: 73 01 c3 48 8b 0d f9 f5 (...) RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0 RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897 RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8 RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000 R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740 R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760 ============================================================================= BUG btrfs_delayed_extent_op (Tainted: G B W ): Objects remaining in btrfs_delayed_extent_op on __kmem_cache_shutdown() ----------------------------------------------------------------------------- INFO: Slab 0x00000000f145ce2f objects=22 used=1 fp=0x00000000af0f92cf flags=0x17fffc000010200 CPU: 5 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack+0x8d/0xb5 slab_err+0xb7/0xdc ? lock_acquired+0x199/0x490 __kmem_cache_shutdown+0x1ac/0x3c0 ? __mutex_unlock_slowpath+0x45/0x2a0 kmem_cache_destroy+0x55/0x120 exit_btrfs_fs+0xa/0x59 [btrfs] __x64_sys_delete_module+0x194/0x260 ? fpregs_assert_state_consistent+0x1e/0x40 ? exit_to_user_mode_prepare+0x55/0x1c0 ? trace_hardirqs_on+0x1b/0xf0 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f693e305897 Code: 73 01 c3 48 8b 0d f9 f5 (...) RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0 RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897 RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8 RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000 R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740 R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760 INFO: Object 0x000000004cf95ea8 @offset=6264 INFO: Allocated in btrfs_alloc_tree_block+0x1e0/0x360 [btrfs] age=1931 cpu=6 pid=1729873 __slab_alloc.isra.0+0x109/0x1c0 kmem_cache_alloc+0x7bb/0x830 btrfs_alloc_tree_block+0x1e0/0x360 [btrfs] alloc_tree_block_no_bg_flush+0x4f/0x60 [btrfs] __btrfs_cow_block+0x12d/0x5f0 [btrfs] btrfs_cow_block+0xf7/0x220 [btrfs] btrfs_search_slot+0x62a/0xc40 [btrfs] btrfs_del_orphan_item+0x65/0xd0 [btrfs] btrfs_find_orphan_roots+0x1bf/0x200 [btrfs] open_ctree+0x125a/0x18a0 [btrfs] btrfs_mount_root.cold+0x13/0xed [btrfs] legacy_get_tree+0x30/0x60 vfs_get_tree+0x28/0xe0 fc_mount+0xe/0x40 vfs_kern_mount.part.0+0x71/0x90 btrfs_mount+0x13b/0x3e0 [btrfs] INFO: Freed in __btrfs_run_delayed_refs+0xabd/0x1290 [btrfs] age=3173 cpu=6 pid=1729803 kmem_cache_free+0x34c/0x3c0 __btrfs_run_delayed_refs+0xabd/0x1290 [btrfs] btrfs_run_delayed_refs+0x81/0x210 [btrfs] commit_cowonly_roots+0xfb/0x300 [btrfs] btrfs_commit_transaction+0x367/0xc40 [btrfs] close_ctree+0x113/0x2fa [btrfs] generic_shutdown_super+0x6c/0x100 kill_anon_super+0x14/0x30 btrfs_kill_super+0x12/0x20 [btrfs] deactivate_locked_super+0x31/0x70 cleanup_mnt+0x100/0x160 task_work_run+0x68/0xb0 exit_to_user_mode_prepare+0x1bb/0x1c0 syscall_exit_to_user_mode+0x4b/0x260 entry_SYSCALL_64_after_hwframe+0x44/0xa9 kmem_cache_destroy btrfs_delayed_extent_op: Slab cache still has objects CPU: 3 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack+0x8d/0xb5 kmem_cache_destroy+0x119/0x120 exit_btrfs_fs+0xa/0x59 [btrfs] __x64_sys_delete_module+0x194/0x260 ? fpregs_assert_state_consistent+0x1e/0x40 ? exit_to_user_mode_prepare+0x55/0x1c0 ? trace_hardirqs_on+0x1b/0xf0 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f693e305897 Code: 73 01 c3 48 8b 0d f9 (...) RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0 RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897 RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8 RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000 R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740 R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760 BTRFS: state leak: start 30408704 end 30425087 state 1 in tree 1 refs 1 So fix this by calling btrfs_find_orphan_roots() in the mount path only if we are mounting the filesystem in RW mode. It's pointless to have it called for RO mounts anyway, since despite adding any deleted roots to the list of dead roots, we will never have the roots deleted until the filesystem is remounted in RW mode, as the cleaner kthread does nothing when we are mounted in RO - btrfs_need_cleaner_sleep() always returns true and the cleaner spends all time sleeping, never cleaning dead roots. This is accomplished by moving the call to btrfs_find_orphan_roots() from open_ctree() to btrfs_start_pre_rw_mount(), which also guarantees that if later the filesystem is remounted RW, we populate the list of dead roots and have the cleaner task delete the dead roots. Tested-by: Fabian Vogt Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 765deefda92b..e941cbae3991 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2969,6 +2969,7 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) } } + ret = btrfs_find_orphan_roots(fs_info); out: return ret; } @@ -3383,10 +3384,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device } } - ret = btrfs_find_orphan_roots(fs_info); - if (ret) - goto fail_qgroup; - fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true); if (IS_ERR(fs_info->fs_root)) { err = PTR_ERR(fs_info->fs_root); From a0a1db70df5f48576fea6d08f0a69c05f3ab4cf4 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 14 Dec 2020 10:10:47 +0000 Subject: [PATCH 071/547] btrfs: fix race between RO remount and the cleaner task When we are remounting a filesystem in RO mode we can race with the cleaner task and result in leaking a transaction if the filesystem is unmounted shortly after, before the transaction kthread had a chance to commit that transaction. That also results in a crash during unmount, due to a use-after-free, if hardware acceleration is not available for crc32c. The following sequence of steps explains how the race happens. 1) The filesystem is mounted in RW mode and the cleaner task is running. This means that currently BTRFS_FS_CLEANER_RUNNING is set at fs_info->flags; 2) The cleaner task is currently running delayed iputs for example; 3) A filesystem RO remount operation starts; 4) The RO remount task calls btrfs_commit_super(), which commits any currently open transaction, and it finishes; 5) At this point the cleaner task is still running and it creates a new transaction by doing one of the following things: * When running the delayed iput() for an inode with a 0 link count, in which case at btrfs_evict_inode() we start a transaction through the call to evict_refill_and_join(), use it and then release its handle through btrfs_end_transaction(); * When deleting a dead root through btrfs_clean_one_deleted_snapshot(), a transaction is started at btrfs_drop_snapshot() and then its handle is released through a call to btrfs_end_transaction_throttle(); * When the remount task was still running, and before the remount task called btrfs_delete_unused_bgs(), the cleaner task also called btrfs_delete_unused_bgs() and it picked and removed one block group from the list of unused block groups. Before the cleaner task started a transaction, through btrfs_start_trans_remove_block_group() at btrfs_delete_unused_bgs(), the remount task had already called btrfs_commit_super(); 6) So at this point the filesystem is in RO mode and we have an open transaction that was started by the cleaner task; 7) Shortly after a filesystem unmount operation starts. At close_ctree() we stop the transaction kthread before it had a chance to commit the transaction, since less than 30 seconds (the default commit interval) have elapsed since the last transaction was committed; 8) We end up calling iput() against the btree inode at close_ctree() while there is an open transaction, and since that transaction was used to update btrees by the cleaner, we have dirty pages in the btree inode due to COW operations on metadata extents, and therefore writeback is triggered for the btree inode. So btree_write_cache_pages() is invoked to flush those dirty pages during the final iput() on the btree inode. This results in creating a bio and submitting it, which makes us end up at btrfs_submit_metadata_bio(); 9) At btrfs_submit_metadata_bio() we end up at the if-then-else branch that calls btrfs_wq_submit_bio(), because check_async_write() returned a value of 1. This value of 1 is because we did not have hardware acceleration available for crc32c, so BTRFS_FS_CSUM_IMPL_FAST was not set in fs_info->flags; 10) Then at btrfs_wq_submit_bio() we call btrfs_queue_work() against the workqueue at fs_info->workers, which was already freed before by the call to btrfs_stop_all_workers() at close_ctree(). This results in an invalid memory access due to a use-after-free, leading to a crash. When this happens, before the crash there are several warnings triggered, since we have reserved metadata space in a block group, the delayed refs reservation, etc: ------------[ cut here ]------------ WARNING: CPU: 4 PID: 1729896 at fs/btrfs/block-group.c:125 btrfs_put_block_group+0x63/0xa0 [btrfs] Modules linked in: btrfs dm_snapshot dm_thin_pool (...) CPU: 4 PID: 1729896 Comm: umount Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:btrfs_put_block_group+0x63/0xa0 [btrfs] Code: f0 01 00 00 48 39 c2 75 (...) RSP: 0018:ffffb270826bbdd8 EFLAGS: 00010206 RAX: 0000000000000001 RBX: ffff947ed73e4000 RCX: ffff947ebc8b29c8 RDX: 0000000000000001 RSI: ffffffffc0b150a0 RDI: ffff947ebc8b2800 RBP: ffff947ebc8b2800 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000001 R12: ffff947ed73e4110 R13: ffff947ed73e4160 R14: ffff947ebc8b2988 R15: dead000000000100 FS: 00007f15edfea840(0000) GS:ffff9481ad600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f37e2893320 CR3: 0000000138f68001 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: btrfs_free_block_groups+0x17f/0x2f0 [btrfs] close_ctree+0x2ba/0x2fa [btrfs] generic_shutdown_super+0x6c/0x100 kill_anon_super+0x14/0x30 btrfs_kill_super+0x12/0x20 [btrfs] deactivate_locked_super+0x31/0x70 cleanup_mnt+0x100/0x160 task_work_run+0x68/0xb0 exit_to_user_mode_prepare+0x1bb/0x1c0 syscall_exit_to_user_mode+0x4b/0x260 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f15ee221ee7 Code: ff 0b 00 f7 d8 64 89 01 48 (...) RSP: 002b:00007ffe9470f0f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 RAX: 0000000000000000 RBX: 00007f15ee347264 RCX: 00007f15ee221ee7 RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 000056169701d000 RBP: 0000561697018a30 R08: 0000000000000000 R09: 00007f15ee2e2be0 R10: 000056169701efe0 R11: 0000000000000246 R12: 0000000000000000 R13: 000056169701d000 R14: 0000561697018b40 R15: 0000561697018c60 irq event stamp: 0 hardirqs last enabled at (0): [<0000000000000000>] 0x0 hardirqs last disabled at (0): [] copy_process+0x8a0/0x1d70 softirqs last enabled at (0): [] copy_process+0x8a0/0x1d70 softirqs last disabled at (0): [<0000000000000000>] 0x0 ---[ end trace dd74718fef1ed5c6 ]--- ------------[ cut here ]------------ WARNING: CPU: 2 PID: 1729896 at fs/btrfs/block-rsv.c:459 btrfs_release_global_block_rsv+0x70/0xc0 [btrfs] Modules linked in: btrfs dm_snapshot dm_thin_pool (...) CPU: 2 PID: 1729896 Comm: umount Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:btrfs_release_global_block_rsv+0x70/0xc0 [btrfs] Code: 48 83 bb b0 03 00 00 00 (...) RSP: 0018:ffffb270826bbdd8 EFLAGS: 00010206 RAX: 000000000033c000 RBX: ffff947ed73e4000 RCX: 0000000000000000 RDX: 0000000000000001 RSI: ffffffffc0b0d8c1 RDI: 00000000ffffffff RBP: ffff947ebc8b7000 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000001 R12: ffff947ed73e4110 R13: ffff947ed73e5278 R14: dead000000000122 R15: dead000000000100 FS: 00007f15edfea840(0000) GS:ffff9481aca00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000561a79f76e20 CR3: 0000000138f68006 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: btrfs_free_block_groups+0x24c/0x2f0 [btrfs] close_ctree+0x2ba/0x2fa [btrfs] generic_shutdown_super+0x6c/0x100 kill_anon_super+0x14/0x30 btrfs_kill_super+0x12/0x20 [btrfs] deactivate_locked_super+0x31/0x70 cleanup_mnt+0x100/0x160 task_work_run+0x68/0xb0 exit_to_user_mode_prepare+0x1bb/0x1c0 syscall_exit_to_user_mode+0x4b/0x260 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f15ee221ee7 Code: ff 0b 00 f7 d8 64 89 01 (...) RSP: 002b:00007ffe9470f0f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 RAX: 0000000000000000 RBX: 00007f15ee347264 RCX: 00007f15ee221ee7 RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 000056169701d000 RBP: 0000561697018a30 R08: 0000000000000000 R09: 00007f15ee2e2be0 R10: 000056169701efe0 R11: 0000000000000246 R12: 0000000000000000 R13: 000056169701d000 R14: 0000561697018b40 R15: 0000561697018c60 irq event stamp: 0 hardirqs last enabled at (0): [<0000000000000000>] 0x0 hardirqs last disabled at (0): [] copy_process+0x8a0/0x1d70 softirqs last enabled at (0): [] copy_process+0x8a0/0x1d70 softirqs last disabled at (0): [<0000000000000000>] 0x0 ---[ end trace dd74718fef1ed5c7 ]--- ------------[ cut here ]------------ WARNING: CPU: 2 PID: 1729896 at fs/btrfs/block-group.c:3377 btrfs_free_block_groups+0x25d/0x2f0 [btrfs] Modules linked in: btrfs dm_snapshot dm_thin_pool (...) CPU: 5 PID: 1729896 Comm: umount Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:btrfs_free_block_groups+0x25d/0x2f0 [btrfs] Code: ad de 49 be 22 01 00 (...) RSP: 0018:ffffb270826bbde8 EFLAGS: 00010206 RAX: ffff947ebeae1d08 RBX: ffff947ed73e4000 RCX: 0000000000000000 RDX: 0000000000000001 RSI: ffff947e9d823ae8 RDI: 0000000000000246 RBP: ffff947ebeae1d08 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000001 R12: ffff947ebeae1c00 R13: ffff947ed73e5278 R14: dead000000000122 R15: dead000000000100 FS: 00007f15edfea840(0000) GS:ffff9481ad200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f1475d98ea8 CR3: 0000000138f68005 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: close_ctree+0x2ba/0x2fa [btrfs] generic_shutdown_super+0x6c/0x100 kill_anon_super+0x14/0x30 btrfs_kill_super+0x12/0x20 [btrfs] deactivate_locked_super+0x31/0x70 cleanup_mnt+0x100/0x160 task_work_run+0x68/0xb0 exit_to_user_mode_prepare+0x1bb/0x1c0 syscall_exit_to_user_mode+0x4b/0x260 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f15ee221ee7 Code: ff 0b 00 f7 d8 64 89 (...) RSP: 002b:00007ffe9470f0f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 RAX: 0000000000000000 RBX: 00007f15ee347264 RCX: 00007f15ee221ee7 RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 000056169701d000 RBP: 0000561697018a30 R08: 0000000000000000 R09: 00007f15ee2e2be0 R10: 000056169701efe0 R11: 0000000000000246 R12: 0000000000000000 R13: 000056169701d000 R14: 0000561697018b40 R15: 0000561697018c60 irq event stamp: 0 hardirqs last enabled at (0): [<0000000000000000>] 0x0 hardirqs last disabled at (0): [] copy_process+0x8a0/0x1d70 softirqs last enabled at (0): [] copy_process+0x8a0/0x1d70 softirqs last disabled at (0): [<0000000000000000>] 0x0 ---[ end trace dd74718fef1ed5c8 ]--- BTRFS info (device sdc): space_info 4 has 268238848 free, is not full BTRFS info (device sdc): space_info total=268435456, used=114688, pinned=0, reserved=16384, may_use=0, readonly=65536 BTRFS info (device sdc): global_block_rsv: size 0 reserved 0 BTRFS info (device sdc): trans_block_rsv: size 0 reserved 0 BTRFS info (device sdc): chunk_block_rsv: size 0 reserved 0 BTRFS info (device sdc): delayed_block_rsv: size 0 reserved 0 BTRFS info (device sdc): delayed_refs_rsv: size 524288 reserved 0 And the crash, which only happens when we do not have crc32c hardware acceleration, produces the following trace immediately after those warnings: stack segment: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI CPU: 2 PID: 1749129 Comm: umount Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:btrfs_queue_work+0x36/0x190 [btrfs] Code: 54 55 53 48 89 f3 (...) RSP: 0018:ffffb27082443ae8 EFLAGS: 00010282 RAX: 0000000000000004 RBX: ffff94810ee9ad90 RCX: 0000000000000000 RDX: 0000000000000001 RSI: ffff94810ee9ad90 RDI: ffff947ed8ee75a0 RBP: a56b6b6b6b6b6b6b R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000007 R11: 0000000000000001 R12: ffff947fa9b435a8 R13: ffff94810ee9ad90 R14: 0000000000000000 R15: ffff947e93dc0000 FS: 00007f3cfe974840(0000) GS:ffff9481ac600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f1b42995a70 CR3: 0000000127638003 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: btrfs_wq_submit_bio+0xb3/0xd0 [btrfs] btrfs_submit_metadata_bio+0x44/0xc0 [btrfs] submit_one_bio+0x61/0x70 [btrfs] btree_write_cache_pages+0x414/0x450 [btrfs] ? kobject_put+0x9a/0x1d0 ? trace_hardirqs_on+0x1b/0xf0 ? _raw_spin_unlock_irqrestore+0x3c/0x60 ? free_debug_processing+0x1e1/0x2b0 do_writepages+0x43/0xe0 ? lock_acquired+0x199/0x490 __writeback_single_inode+0x59/0x650 writeback_single_inode+0xaf/0x120 write_inode_now+0x94/0xd0 iput+0x187/0x2b0 close_ctree+0x2c6/0x2fa [btrfs] generic_shutdown_super+0x6c/0x100 kill_anon_super+0x14/0x30 btrfs_kill_super+0x12/0x20 [btrfs] deactivate_locked_super+0x31/0x70 cleanup_mnt+0x100/0x160 task_work_run+0x68/0xb0 exit_to_user_mode_prepare+0x1bb/0x1c0 syscall_exit_to_user_mode+0x4b/0x260 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f3cfebabee7 Code: ff 0b 00 f7 d8 64 89 01 (...) RSP: 002b:00007ffc9c9a05f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 RAX: 0000000000000000 RBX: 00007f3cfecd1264 RCX: 00007f3cfebabee7 RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 0000562b6b478000 RBP: 0000562b6b473a30 R08: 0000000000000000 R09: 00007f3cfec6cbe0 R10: 0000562b6b479fe0 R11: 0000000000000246 R12: 0000000000000000 R13: 0000562b6b478000 R14: 0000562b6b473b40 R15: 0000562b6b473c60 Modules linked in: btrfs dm_snapshot dm_thin_pool (...) ---[ end trace dd74718fef1ed5cc ]--- Finally when we remove the btrfs module (rmmod btrfs), there are several warnings about objects that were allocated from our slabs but were never freed, consequence of the transaction that was never committed and got leaked: ============================================================================= BUG btrfs_delayed_ref_head (Tainted: G B W ): Objects remaining in btrfs_delayed_ref_head on __kmem_cache_shutdown() ----------------------------------------------------------------------------- INFO: Slab 0x0000000094c2ae56 objects=24 used=2 fp=0x000000002bfa2521 flags=0x17fffc000010200 CPU: 5 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack+0x8d/0xb5 slab_err+0xb7/0xdc ? lock_acquired+0x199/0x490 __kmem_cache_shutdown+0x1ac/0x3c0 ? lock_release+0x20e/0x4c0 kmem_cache_destroy+0x55/0x120 btrfs_delayed_ref_exit+0x11/0x35 [btrfs] exit_btrfs_fs+0xa/0x59 [btrfs] __x64_sys_delete_module+0x194/0x260 ? fpregs_assert_state_consistent+0x1e/0x40 ? exit_to_user_mode_prepare+0x55/0x1c0 ? trace_hardirqs_on+0x1b/0xf0 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f693e305897 Code: 73 01 c3 48 8b 0d f9 f5 (...) RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0 RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897 RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8 RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000 R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740 R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760 INFO: Object 0x0000000050cbdd61 @offset=12104 INFO: Allocated in btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs] age=1894 cpu=6 pid=1729873 __slab_alloc.isra.0+0x109/0x1c0 kmem_cache_alloc+0x7bb/0x830 btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs] btrfs_free_tree_block+0x128/0x360 [btrfs] __btrfs_cow_block+0x489/0x5f0 [btrfs] btrfs_cow_block+0xf7/0x220 [btrfs] btrfs_search_slot+0x62a/0xc40 [btrfs] btrfs_del_orphan_item+0x65/0xd0 [btrfs] btrfs_find_orphan_roots+0x1bf/0x200 [btrfs] open_ctree+0x125a/0x18a0 [btrfs] btrfs_mount_root.cold+0x13/0xed [btrfs] legacy_get_tree+0x30/0x60 vfs_get_tree+0x28/0xe0 fc_mount+0xe/0x40 vfs_kern_mount.part.0+0x71/0x90 btrfs_mount+0x13b/0x3e0 [btrfs] INFO: Freed in __btrfs_run_delayed_refs+0x1117/0x1290 [btrfs] age=4292 cpu=2 pid=1729526 kmem_cache_free+0x34c/0x3c0 __btrfs_run_delayed_refs+0x1117/0x1290 [btrfs] btrfs_run_delayed_refs+0x81/0x210 [btrfs] commit_cowonly_roots+0xfb/0x300 [btrfs] btrfs_commit_transaction+0x367/0xc40 [btrfs] sync_filesystem+0x74/0x90 generic_shutdown_super+0x22/0x100 kill_anon_super+0x14/0x30 btrfs_kill_super+0x12/0x20 [btrfs] deactivate_locked_super+0x31/0x70 cleanup_mnt+0x100/0x160 task_work_run+0x68/0xb0 exit_to_user_mode_prepare+0x1bb/0x1c0 syscall_exit_to_user_mode+0x4b/0x260 entry_SYSCALL_64_after_hwframe+0x44/0xa9 INFO: Object 0x0000000086e9b0ff @offset=12776 INFO: Allocated in btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs] age=1900 cpu=6 pid=1729873 __slab_alloc.isra.0+0x109/0x1c0 kmem_cache_alloc+0x7bb/0x830 btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs] btrfs_alloc_tree_block+0x2bf/0x360 [btrfs] alloc_tree_block_no_bg_flush+0x4f/0x60 [btrfs] __btrfs_cow_block+0x12d/0x5f0 [btrfs] btrfs_cow_block+0xf7/0x220 [btrfs] btrfs_search_slot+0x62a/0xc40 [btrfs] btrfs_del_orphan_item+0x65/0xd0 [btrfs] btrfs_find_orphan_roots+0x1bf/0x200 [btrfs] open_ctree+0x125a/0x18a0 [btrfs] btrfs_mount_root.cold+0x13/0xed [btrfs] legacy_get_tree+0x30/0x60 vfs_get_tree+0x28/0xe0 fc_mount+0xe/0x40 vfs_kern_mount.part.0+0x71/0x90 INFO: Freed in __btrfs_run_delayed_refs+0x1117/0x1290 [btrfs] age=3141 cpu=6 pid=1729803 kmem_cache_free+0x34c/0x3c0 __btrfs_run_delayed_refs+0x1117/0x1290 [btrfs] btrfs_run_delayed_refs+0x81/0x210 [btrfs] btrfs_write_dirty_block_groups+0x17d/0x3d0 [btrfs] commit_cowonly_roots+0x248/0x300 [btrfs] btrfs_commit_transaction+0x367/0xc40 [btrfs] close_ctree+0x113/0x2fa [btrfs] generic_shutdown_super+0x6c/0x100 kill_anon_super+0x14/0x30 btrfs_kill_super+0x12/0x20 [btrfs] deactivate_locked_super+0x31/0x70 cleanup_mnt+0x100/0x160 task_work_run+0x68/0xb0 exit_to_user_mode_prepare+0x1bb/0x1c0 syscall_exit_to_user_mode+0x4b/0x260 entry_SYSCALL_64_after_hwframe+0x44/0xa9 kmem_cache_destroy btrfs_delayed_ref_head: Slab cache still has objects CPU: 5 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack+0x8d/0xb5 kmem_cache_destroy+0x119/0x120 btrfs_delayed_ref_exit+0x11/0x35 [btrfs] exit_btrfs_fs+0xa/0x59 [btrfs] __x64_sys_delete_module+0x194/0x260 ? fpregs_assert_state_consistent+0x1e/0x40 ? exit_to_user_mode_prepare+0x55/0x1c0 ? trace_hardirqs_on+0x1b/0xf0 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f693e305897 Code: 73 01 c3 48 8b 0d f9 f5 0b (...) RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0 RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897 RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8 RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000 R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740 R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760 ============================================================================= BUG btrfs_delayed_tree_ref (Tainted: G B W ): Objects remaining in btrfs_delayed_tree_ref on __kmem_cache_shutdown() ----------------------------------------------------------------------------- INFO: Slab 0x0000000011f78dc0 objects=37 used=2 fp=0x0000000032d55d91 flags=0x17fffc000010200 CPU: 3 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack+0x8d/0xb5 slab_err+0xb7/0xdc ? lock_acquired+0x199/0x490 __kmem_cache_shutdown+0x1ac/0x3c0 ? lock_release+0x20e/0x4c0 kmem_cache_destroy+0x55/0x120 btrfs_delayed_ref_exit+0x1d/0x35 [btrfs] exit_btrfs_fs+0xa/0x59 [btrfs] __x64_sys_delete_module+0x194/0x260 ? fpregs_assert_state_consistent+0x1e/0x40 ? exit_to_user_mode_prepare+0x55/0x1c0 ? trace_hardirqs_on+0x1b/0xf0 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f693e305897 Code: 73 01 c3 48 8b 0d f9 f5 (...) RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0 RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897 RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8 RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000 R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740 R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760 INFO: Object 0x000000001a340018 @offset=4408 INFO: Allocated in btrfs_add_delayed_tree_ref+0x9e/0x480 [btrfs] age=1917 cpu=6 pid=1729873 __slab_alloc.isra.0+0x109/0x1c0 kmem_cache_alloc+0x7bb/0x830 btrfs_add_delayed_tree_ref+0x9e/0x480 [btrfs] btrfs_free_tree_block+0x128/0x360 [btrfs] __btrfs_cow_block+0x489/0x5f0 [btrfs] btrfs_cow_block+0xf7/0x220 [btrfs] btrfs_search_slot+0x62a/0xc40 [btrfs] btrfs_del_orphan_item+0x65/0xd0 [btrfs] btrfs_find_orphan_roots+0x1bf/0x200 [btrfs] open_ctree+0x125a/0x18a0 [btrfs] btrfs_mount_root.cold+0x13/0xed [btrfs] legacy_get_tree+0x30/0x60 vfs_get_tree+0x28/0xe0 fc_mount+0xe/0x40 vfs_kern_mount.part.0+0x71/0x90 btrfs_mount+0x13b/0x3e0 [btrfs] INFO: Freed in __btrfs_run_delayed_refs+0x63d/0x1290 [btrfs] age=4167 cpu=4 pid=1729795 kmem_cache_free+0x34c/0x3c0 __btrfs_run_delayed_refs+0x63d/0x1290 [btrfs] btrfs_run_delayed_refs+0x81/0x210 [btrfs] btrfs_commit_transaction+0x60/0xc40 [btrfs] create_subvol+0x56a/0x990 [btrfs] btrfs_mksubvol+0x3fb/0x4a0 [btrfs] __btrfs_ioctl_snap_create+0x119/0x1a0 [btrfs] btrfs_ioctl_snap_create+0x58/0x80 [btrfs] btrfs_ioctl+0x1a92/0x36f0 [btrfs] __x64_sys_ioctl+0x83/0xb0 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 INFO: Object 0x000000002b46292a @offset=13648 INFO: Allocated in btrfs_add_delayed_tree_ref+0x9e/0x480 [btrfs] age=1923 cpu=6 pid=1729873 __slab_alloc.isra.0+0x109/0x1c0 kmem_cache_alloc+0x7bb/0x830 btrfs_add_delayed_tree_ref+0x9e/0x480 [btrfs] btrfs_alloc_tree_block+0x2bf/0x360 [btrfs] alloc_tree_block_no_bg_flush+0x4f/0x60 [btrfs] __btrfs_cow_block+0x12d/0x5f0 [btrfs] btrfs_cow_block+0xf7/0x220 [btrfs] btrfs_search_slot+0x62a/0xc40 [btrfs] btrfs_del_orphan_item+0x65/0xd0 [btrfs] btrfs_find_orphan_roots+0x1bf/0x200 [btrfs] open_ctree+0x125a/0x18a0 [btrfs] btrfs_mount_root.cold+0x13/0xed [btrfs] legacy_get_tree+0x30/0x60 vfs_get_tree+0x28/0xe0 fc_mount+0xe/0x40 vfs_kern_mount.part.0+0x71/0x90 INFO: Freed in __btrfs_run_delayed_refs+0x63d/0x1290 [btrfs] age=3164 cpu=6 pid=1729803 kmem_cache_free+0x34c/0x3c0 __btrfs_run_delayed_refs+0x63d/0x1290 [btrfs] btrfs_run_delayed_refs+0x81/0x210 [btrfs] commit_cowonly_roots+0xfb/0x300 [btrfs] btrfs_commit_transaction+0x367/0xc40 [btrfs] close_ctree+0x113/0x2fa [btrfs] generic_shutdown_super+0x6c/0x100 kill_anon_super+0x14/0x30 btrfs_kill_super+0x12/0x20 [btrfs] deactivate_locked_super+0x31/0x70 cleanup_mnt+0x100/0x160 task_work_run+0x68/0xb0 exit_to_user_mode_prepare+0x1bb/0x1c0 syscall_exit_to_user_mode+0x4b/0x260 entry_SYSCALL_64_after_hwframe+0x44/0xa9 kmem_cache_destroy btrfs_delayed_tree_ref: Slab cache still has objects CPU: 5 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack+0x8d/0xb5 kmem_cache_destroy+0x119/0x120 btrfs_delayed_ref_exit+0x1d/0x35 [btrfs] exit_btrfs_fs+0xa/0x59 [btrfs] __x64_sys_delete_module+0x194/0x260 ? fpregs_assert_state_consistent+0x1e/0x40 ? exit_to_user_mode_prepare+0x55/0x1c0 ? trace_hardirqs_on+0x1b/0xf0 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f693e305897 Code: 73 01 c3 48 8b 0d f9 f5 (...) RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0 RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897 RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8 RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000 R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740 R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760 ============================================================================= BUG btrfs_delayed_extent_op (Tainted: G B W ): Objects remaining in btrfs_delayed_extent_op on __kmem_cache_shutdown() ----------------------------------------------------------------------------- INFO: Slab 0x00000000f145ce2f objects=22 used=1 fp=0x00000000af0f92cf flags=0x17fffc000010200 CPU: 5 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack+0x8d/0xb5 slab_err+0xb7/0xdc ? lock_acquired+0x199/0x490 __kmem_cache_shutdown+0x1ac/0x3c0 ? __mutex_unlock_slowpath+0x45/0x2a0 kmem_cache_destroy+0x55/0x120 exit_btrfs_fs+0xa/0x59 [btrfs] __x64_sys_delete_module+0x194/0x260 ? fpregs_assert_state_consistent+0x1e/0x40 ? exit_to_user_mode_prepare+0x55/0x1c0 ? trace_hardirqs_on+0x1b/0xf0 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f693e305897 Code: 73 01 c3 48 8b 0d f9 f5 (...) RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0 RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897 RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8 RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000 R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740 R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760 INFO: Object 0x000000004cf95ea8 @offset=6264 INFO: Allocated in btrfs_alloc_tree_block+0x1e0/0x360 [btrfs] age=1931 cpu=6 pid=1729873 __slab_alloc.isra.0+0x109/0x1c0 kmem_cache_alloc+0x7bb/0x830 btrfs_alloc_tree_block+0x1e0/0x360 [btrfs] alloc_tree_block_no_bg_flush+0x4f/0x60 [btrfs] __btrfs_cow_block+0x12d/0x5f0 [btrfs] btrfs_cow_block+0xf7/0x220 [btrfs] btrfs_search_slot+0x62a/0xc40 [btrfs] btrfs_del_orphan_item+0x65/0xd0 [btrfs] btrfs_find_orphan_roots+0x1bf/0x200 [btrfs] open_ctree+0x125a/0x18a0 [btrfs] btrfs_mount_root.cold+0x13/0xed [btrfs] legacy_get_tree+0x30/0x60 vfs_get_tree+0x28/0xe0 fc_mount+0xe/0x40 vfs_kern_mount.part.0+0x71/0x90 btrfs_mount+0x13b/0x3e0 [btrfs] INFO: Freed in __btrfs_run_delayed_refs+0xabd/0x1290 [btrfs] age=3173 cpu=6 pid=1729803 kmem_cache_free+0x34c/0x3c0 __btrfs_run_delayed_refs+0xabd/0x1290 [btrfs] btrfs_run_delayed_refs+0x81/0x210 [btrfs] commit_cowonly_roots+0xfb/0x300 [btrfs] btrfs_commit_transaction+0x367/0xc40 [btrfs] close_ctree+0x113/0x2fa [btrfs] generic_shutdown_super+0x6c/0x100 kill_anon_super+0x14/0x30 btrfs_kill_super+0x12/0x20 [btrfs] deactivate_locked_super+0x31/0x70 cleanup_mnt+0x100/0x160 task_work_run+0x68/0xb0 exit_to_user_mode_prepare+0x1bb/0x1c0 syscall_exit_to_user_mode+0x4b/0x260 entry_SYSCALL_64_after_hwframe+0x44/0xa9 kmem_cache_destroy btrfs_delayed_extent_op: Slab cache still has objects CPU: 3 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack+0x8d/0xb5 kmem_cache_destroy+0x119/0x120 exit_btrfs_fs+0xa/0x59 [btrfs] __x64_sys_delete_module+0x194/0x260 ? fpregs_assert_state_consistent+0x1e/0x40 ? exit_to_user_mode_prepare+0x55/0x1c0 ? trace_hardirqs_on+0x1b/0xf0 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f693e305897 Code: 73 01 c3 48 8b 0d f9 (...) RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0 RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897 RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8 RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000 R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740 R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760 BTRFS: state leak: start 30408704 end 30425087 state 1 in tree 1 refs 1 So fix this by making the remount path to wait for the cleaner task before calling btrfs_commit_super(). The remount path now waits for the bit BTRFS_FS_CLEANER_RUNNING to be cleared from fs_info->flags before calling btrfs_commit_super() and this ensures the cleaner can not start a transaction after that, because it sleeps when the filesystem is in RO mode and we have already flagged the filesystem as RO before waiting for BTRFS_FS_CLEANER_RUNNING to be cleared. This also introduces a new flag BTRFS_FS_STATE_RO to be used for fs_info->fs_state when the filesystem is in RO mode. This is because we were doing the RO check using the flags of the superblock and setting the RO mode simply by ORing into the superblock's flags - those operations are not atomic and could result in the cleaner not seeing the update from the remount task after it clears BTRFS_FS_CLEANER_RUNNING. Tested-by: Fabian Vogt Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 20 +++++++++++++++++++- fs/btrfs/disk-io.c | 5 ++++- fs/btrfs/super.c | 22 +++++++++++++++++++--- fs/btrfs/volumes.c | 4 ++-- 4 files changed, 44 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3935d297d198..0225c5208f44 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -132,6 +132,8 @@ enum { * defrag */ BTRFS_FS_STATE_REMOUNTING, + /* Filesystem in RO mode */ + BTRFS_FS_STATE_RO, /* Track if a transaction abort has been reported on this filesystem */ BTRFS_FS_STATE_TRANS_ABORTED, /* @@ -2892,10 +2894,26 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) * If we remount the fs to be R/O or umount the fs, the cleaner needn't do * anything except sleeping. This function is used to check the status of * the fs. + * We check for BTRFS_FS_STATE_RO to avoid races with a concurrent remount, + * since setting and checking for SB_RDONLY in the superblock's flags is not + * atomic. */ static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info) { - return fs_info->sb->s_flags & SB_RDONLY || btrfs_fs_closing(fs_info); + return test_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state) || + btrfs_fs_closing(fs_info); +} + +static inline void btrfs_set_sb_rdonly(struct super_block *sb) +{ + sb->s_flags |= SB_RDONLY; + set_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); +} + +static inline void btrfs_clear_sb_rdonly(struct super_block *sb) +{ + sb->s_flags &= ~SB_RDONLY; + clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); } /* tree mod log functions from ctree.c */ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e941cbae3991..e7bcbd0b93ef 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1729,7 +1729,7 @@ static int cleaner_kthread(void *arg) */ btrfs_delete_unused_bgs(fs_info); sleep: - clear_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags); + clear_and_wake_up_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags); if (kthread_should_park()) kthread_parkme(); if (kthread_should_stop()) @@ -2830,6 +2830,9 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block return -ENOMEM; btrfs_init_delayed_root(fs_info->delayed_root); + if (sb_rdonly(sb)) + set_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state); + return btrfs_alloc_stripe_hash_table(fs_info); } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index b24fa62375e0..38740cc2919f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -175,7 +175,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function btrfs_discard_stop(fs_info); /* btrfs handle error by forcing the filesystem readonly */ - sb->s_flags |= SB_RDONLY; + btrfs_set_sb_rdonly(sb); btrfs_info(fs_info, "forced readonly"); /* * Note that a running device replace operation is not canceled here @@ -1953,7 +1953,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) /* avoid complains from lockdep et al. */ up(&fs_info->uuid_tree_rescan_sem); - sb->s_flags |= SB_RDONLY; + btrfs_set_sb_rdonly(sb); /* * Setting SB_RDONLY will put the cleaner thread to @@ -1964,6 +1964,20 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) */ btrfs_delete_unused_bgs(fs_info); + /* + * The cleaner task could be already running before we set the + * flag BTRFS_FS_STATE_RO (and SB_RDONLY in the superblock). + * We must make sure that after we finish the remount, i.e. after + * we call btrfs_commit_super(), the cleaner can no longer start + * a transaction - either because it was dropping a dead root, + * running delayed iputs or deleting an unused block group (the + * cleaner picked a block group from the list of unused block + * groups before we were able to in the previous call to + * btrfs_delete_unused_bgs()). + */ + wait_on_bit(&fs_info->flags, BTRFS_FS_CLEANER_RUNNING, + TASK_UNINTERRUPTIBLE); + btrfs_dev_replace_suspend_for_unmount(fs_info); btrfs_scrub_cancel(fs_info); btrfs_pause_balance(fs_info); @@ -2014,7 +2028,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (ret) goto restore; - sb->s_flags &= ~SB_RDONLY; + btrfs_clear_sb_rdonly(sb); set_bit(BTRFS_FS_OPEN, &fs_info->flags); } @@ -2036,6 +2050,8 @@ restore: /* We've hit an error - don't reset SB_RDONLY */ if (sb_rdonly(sb)) old_flags |= SB_RDONLY; + if (!(old_flags & SB_RDONLY)) + clear_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state); sb->s_flags = old_flags; fs_info->mount_opt = old_opts; fs_info->compress_type = old_compress_type; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7930e1c78c45..2c0aa03b6437 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2593,7 +2593,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); if (seeding_dev) { - sb->s_flags &= ~SB_RDONLY; + btrfs_clear_sb_rdonly(sb); ret = btrfs_prepare_sprout(fs_info); if (ret) { btrfs_abort_transaction(trans, ret); @@ -2729,7 +2729,7 @@ error_sysfs: mutex_unlock(&fs_info->fs_devices->device_list_mutex); error_trans: if (seeding_dev) - sb->s_flags |= SB_RDONLY; + btrfs_set_sb_rdonly(sb); if (trans) btrfs_end_transaction(trans); error_free_zone: From 0a31daa4b602ff6861fdf182236d64b2a353bace Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 14 Dec 2020 10:10:48 +0000 Subject: [PATCH 072/547] btrfs: add assertion for empty list of transactions at late stage of umount Add an assertion to close_ctree(), after destroying all the work queues, to verify we do not have any transaction still open or committing at that at that point. If we have any, it means something is seriously wrong and that can cause memory leaks and use-after-free problems. This is motivated by the previous patches that fixed bugs where we ended up leaking an open transaction after unmounting the filesystem. Tested-by: Fabian Vogt Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e7bcbd0b93ef..1dfd4b2d0e1e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4181,6 +4181,9 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) invalidate_inode_pages2(fs_info->btree_inode->i_mapping); btrfs_stop_all_workers(fs_info); + /* We shouldn't have any transaction open at this point */ + ASSERT(list_empty(&fs_info->trans_list)); + clear_bit(BTRFS_FS_OPEN, &fs_info->flags); free_root_pointers(fs_info, true); btrfs_free_fs_roots(fs_info); From a8cc263eb58ca133617662a5a5e07131d0ebf299 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 14 Dec 2020 10:10:49 +0000 Subject: [PATCH 073/547] btrfs: run delayed iputs when remounting RO to avoid leaking them When remounting RO, after setting the superblock with the RO flag, the cleaner task will start sleeping and do nothing, since the call to btrfs_need_cleaner_sleep() keeps returning 'true'. However, when the cleaner task goes to sleep, the list of delayed iputs may not be empty. As long as we are in RO mode, the cleaner task will keep sleeping and never run the delayed iputs. This means that if a filesystem unmount is started, we get into close_ctree() with a non-empty list of delayed iputs, and because the filesystem is in RO mode and is not in an error state (or a transaction aborted), btrfs_error_commit_super() and btrfs_commit_super(), which run the delayed iputs, are never called, and later we fail the assertion that checks if the delayed iputs list is empty: assertion failed: list_empty(&fs_info->delayed_iputs), in fs/btrfs/disk-io.c:4049 ------------[ cut here ]------------ kernel BUG at fs/btrfs/ctree.h:3153! invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI CPU: 1 PID: 3780621 Comm: umount Tainted: G L 5.6.0-rc2-btrfs-next-73 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-0-ga698c8995f-prebuilt.qemu.org 04/01/2014 RIP: 0010:assertfail.constprop.0+0x18/0x26 [btrfs] Code: 8b 7b 58 48 85 ff 74 (...) RSP: 0018:ffffb748c89bbdf8 EFLAGS: 00010246 RAX: 0000000000000051 RBX: ffff9608f2584000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffffff91998988 RDI: 00000000ffffffff RBP: ffff9608f25870d8 R08: 0000000000000000 R09: 0000000000000001 R10: 0000000000000000 R11: 0000000000000000 R12: ffffffffc0cbc500 R13: ffffffff92411750 R14: 0000000000000000 R15: ffff9608f2aab250 FS: 00007fcbfaa66c80(0000) GS:ffff960936c80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fffc2c2dd38 CR3: 0000000235e54002 CR4: 00000000003606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: close_ctree+0x1a2/0x2e6 [btrfs] generic_shutdown_super+0x6c/0x100 kill_anon_super+0x14/0x30 btrfs_kill_super+0x12/0x20 [btrfs] deactivate_locked_super+0x31/0x70 cleanup_mnt+0x100/0x160 task_work_run+0x93/0xc0 exit_to_usermode_loop+0xf9/0x100 do_syscall_64+0x20d/0x260 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x7fcbfaca6307 Code: eb 0b 00 f7 d8 64 89 (...) RSP: 002b:00007fffc2c2ed68 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 RAX: 0000000000000000 RBX: 0000558203b559b0 RCX: 00007fcbfaca6307 RDX: 0000000000000001 RSI: 0000000000000000 RDI: 0000558203b55bc0 RBP: 0000000000000000 R08: 0000000000000001 R09: 00007fffc2c2dad0 R10: 0000558203b55bf0 R11: 0000000000000246 R12: 0000558203b55bc0 R13: 00007fcbfadcc204 R14: 0000558203b55aa8 R15: 0000000000000000 Modules linked in: btrfs dm_flakey dm_log_writes (...) ---[ end trace d44d303790049ef6 ]--- So fix this by making the remount RO path run any remaining delayed iputs after waiting for the cleaner to become inactive. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 38740cc2919f..12d7d3be7cd4 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1978,6 +1978,16 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) wait_on_bit(&fs_info->flags, BTRFS_FS_CLEANER_RUNNING, TASK_UNINTERRUPTIBLE); + /* + * We've set the superblock to RO mode, so we might have made + * the cleaner task sleep without running all pending delayed + * iputs. Go through all the delayed iputs here, so that if an + * unmount happens without remounting RW we don't end up at + * finishing close_ctree() with a non-empty list of delayed + * iputs. + */ + btrfs_run_delayed_iputs(fs_info); + btrfs_dev_replace_suspend_for_unmount(fs_info); btrfs_scrub_cancel(fs_info); btrfs_pause_balance(fs_info); From f09ced4053bc0a2094a12b60b646114c966ef4c6 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 18 Dec 2020 14:45:24 +0100 Subject: [PATCH 074/547] xsk: Fix race in SKB mode transmit with shared cq MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a race when multiple sockets are simultaneously calling sendto() when the completion ring is shared in the SKB case. This is the case when you share the same netdev and queue id through the XDP_SHARED_UMEM bind flag. The problem is that multiple processes can be in xsk_generic_xmit() and call the backpressure mechanism in xskq_prod_reserve(xs->pool->cq). As this is a shared resource in this specific scenario, a race might occur since the rings are single-producer single-consumer. Fix this by moving the tx_completion_lock from the socket to the pool as the pool is shared between the sockets that share the completion ring. (The pool is not shared when this is not the case.) And then protect the accesses to xskq_prod_reserve() with this lock. The tx_completion_lock is renamed cq_lock to better reflect that it protects accesses to the potentially shared completion ring. Fixes: 35fcde7f8deb ("xsk: support for Tx") Reported-by: Xuan Zhuo Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20201218134525.13119-2-magnus.karlsson@gmail.com --- include/net/xdp_sock.h | 4 ---- include/net/xsk_buff_pool.h | 5 +++++ net/xdp/xsk.c | 9 ++++++--- net/xdp/xsk_buff_pool.c | 1 + 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 4f4e93bf814c..cc17bc957548 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -58,10 +58,6 @@ struct xdp_sock { struct xsk_queue *tx ____cacheline_aligned_in_smp; struct list_head tx_list; - /* Mutual exclusion of NAPI TX thread and sendmsg error paths - * in the SKB destructor callback. - */ - spinlock_t tx_completion_lock; /* Protects generic receive. */ spinlock_t rx_lock; diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 01755b838c74..eaa8386dbc63 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -73,6 +73,11 @@ struct xsk_buff_pool { bool dma_need_sync; bool unaligned; void *addrs; + /* Mutual exclusion of the completion ring in the SKB mode. Two cases to protect: + * NAPI TX thread and sendmsg error paths in the SKB destructor callback and when + * sockets share a single cq when the same netdev and queue id is shared. + */ + spinlock_t cq_lock; struct xdp_buff_xsk *free_heads[]; }; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index c6532d77fde7..d531f9cd0de6 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -423,9 +423,9 @@ static void xsk_destruct_skb(struct sk_buff *skb) struct xdp_sock *xs = xdp_sk(skb->sk); unsigned long flags; - spin_lock_irqsave(&xs->tx_completion_lock, flags); + spin_lock_irqsave(&xs->pool->cq_lock, flags); xskq_prod_submit_addr(xs->pool->cq, addr); - spin_unlock_irqrestore(&xs->tx_completion_lock, flags); + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); sock_wfree(skb); } @@ -437,6 +437,7 @@ static int xsk_generic_xmit(struct sock *sk) bool sent_frame = false; struct xdp_desc desc; struct sk_buff *skb; + unsigned long flags; int err = 0; mutex_lock(&xs->mutex); @@ -468,10 +469,13 @@ static int xsk_generic_xmit(struct sock *sk) * if there is space in it. This avoids having to implement * any buffering in the Tx path. */ + spin_lock_irqsave(&xs->pool->cq_lock, flags); if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) { + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); kfree_skb(skb); goto out; } + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); skb->dev = xs->dev; skb->priority = sk->sk_priority; @@ -1303,7 +1307,6 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol, xs->state = XSK_READY; mutex_init(&xs->mutex); spin_lock_init(&xs->rx_lock); - spin_lock_init(&xs->tx_completion_lock); INIT_LIST_HEAD(&xs->map_list); spin_lock_init(&xs->map_list_lock); diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 818b75060922..20598eea658c 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -71,6 +71,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, INIT_LIST_HEAD(&pool->free_list); INIT_LIST_HEAD(&pool->xsk_tx_list); spin_lock_init(&pool->xsk_tx_list_lock); + spin_lock_init(&pool->cq_lock); refcount_set(&pool->users, 1); pool->fq = xs->fq_tmp; From b1b95cb5c0a9694d47d5f845ba97e226cfda957d Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 18 Dec 2020 14:45:25 +0100 Subject: [PATCH 075/547] xsk: Rollback reservation at NETDEV_TX_BUSY MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rollback the reservation in the completion ring when we get a NETDEV_TX_BUSY. When this error is received from the driver, we are supposed to let the user application retry the transmit again. And in order to do this, we need to roll back the failed send so it can be retried. Unfortunately, we did not cancel the reservation we had made in the completion ring. By not doing this, we actually make the completion ring one entry smaller per NETDEV_TX_BUSY error we get, and after enough of these errors the completion ring will be of size zero and transmit will stop working. Fix this by cancelling the reservation when we get a NETDEV_TX_BUSY error. Fixes: 642e450b6b59 ("xsk: Do not discard packet when NETDEV_TX_BUSY") Reported-by: Xuan Zhuo Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20201218134525.13119-3-magnus.karlsson@gmail.com --- net/xdp/xsk.c | 3 +++ net/xdp/xsk_queue.h | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index d531f9cd0de6..8037b04a9edd 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -487,6 +487,9 @@ static int xsk_generic_xmit(struct sock *sk) if (err == NETDEV_TX_BUSY) { /* Tell user-space to retry the send */ skb->destructor = sock_wfree; + spin_lock_irqsave(&xs->pool->cq_lock, flags); + xskq_prod_cancel(xs->pool->cq); + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); /* Free skb without triggering the perf drop trace */ consume_skb(skb); err = -EAGAIN; diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 4a9663aa7afe..2823b7c3302d 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -334,6 +334,11 @@ static inline bool xskq_prod_is_full(struct xsk_queue *q) return xskq_prod_nb_free(q, 1) ? false : true; } +static inline void xskq_prod_cancel(struct xsk_queue *q) +{ + q->cached_prod--; +} + static inline int xskq_prod_reserve(struct xsk_queue *q) { if (xskq_prod_is_full(q)) From e79bb299ccad6983876686a4d8c87c92ebbe5657 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 14 Dec 2020 22:35:39 +0000 Subject: [PATCH 076/547] selftests/bpf: Fix spelling mistake "tranmission" -> "transmission" There are two spelling mistakes in output messages. Fix these. Signed-off-by: Colin Ian King Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20201214223539.83168-1-colin.king@canonical.com --- tools/testing/selftests/bpf/xdpxceiver.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index 014dedaa4dd2..1e722ee76b1f 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -715,7 +715,7 @@ static void worker_pkt_dump(void) int payload = *((uint32_t *)(pkt_buf[iter]->payload + PKT_HDR_SIZE)); if (payload == EOT) { - ksft_print_msg("End-of-tranmission frame received\n"); + ksft_print_msg("End-of-transmission frame received\n"); fprintf(stdout, "---------------------------------------\n"); break; } @@ -747,7 +747,7 @@ static void worker_pkt_validate(void) } if (payloadseqnum == EOT) { - ksft_print_msg("End-of-tranmission frame received: PASS\n"); + ksft_print_msg("End-of-transmission frame received: PASS\n"); sigvar = 1; break; } From d467d80dc399ba77875d647f2f37b7d1a70d94c2 Mon Sep 17 00:00:00 2001 From: Tian Tao Date: Wed, 16 Dec 2020 10:47:15 +0800 Subject: [PATCH 077/547] bpf: Remove unused including Remove including that don't need it. Signed-off-by: Tian Tao Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1608086835-54523-1-git-send-email-tiantao6@hisilicon.com --- kernel/bpf/syscall.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 287be337d5f6..bb2700ec5bf3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include From 4aa1464acbe3697710279a4bd65cb4801ed30425 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 17 Dec 2020 14:29:11 -0800 Subject: [PATCH 078/547] spi: spi-geni-qcom: Fix geni_spi_isr() NULL dereference in timeout case In commit 7ba9bdcb91f6 ("spi: spi-geni-qcom: Don't keep a local state variable") we changed handle_fifo_timeout() so that we set "mas->cur_xfer" to NULL to make absolutely sure that we don't mess with the buffers from the previous transfer in the timeout case. Unfortunately, this caused the IRQ handler to dereference NULL in some cases. One case: CPU0 CPU1 ---- ---- setup_fifo_xfer() geni_se_setup_m_cmd() ... handle_fifo_timeout() spin_lock_irq(mas->lock) mas->cur_xfer = NULL geni_se_cancel_m_cmd() spin_unlock_irq(mas->lock) geni_spi_isr() spin_lock(mas->lock) if (m_irq & M_RX_FIFO_WATERMARK_EN) geni_spi_handle_rx() mas->cur_xfer NULL dereference! tl;dr: Seriously delayed interrupts for RX/TX can lead to timeout handling setting mas->cur_xfer to NULL. Let's check for the NULL transfer in the TX and RX cases and reset the watermark or clear out the fifo respectively to put the hardware back into a sane state. NOTE: things still could get confused if we get timeouts all the way through handle_fifo_timeout() and then start a new transfer because interrupts from the old transfer / cancel / abort could still be pending. A future patch will help this corner case. Fixes: 561de45f72bd ("spi: spi-geni-qcom: Add SPI driver support for GENI based QUP") Signed-off-by: Douglas Anderson Reviewed-by: Stephen Boyd Link: https://lore.kernel.org/r/20201217142842.v3.1.I99ee04f0cb823415df59bd4f550d6ff5756e43d6@changeid Signed-off-by: Mark Brown --- drivers/spi/spi-geni-qcom.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/spi/spi-geni-qcom.c b/drivers/spi/spi-geni-qcom.c index 512e925d5ea4..096edfbde451 100644 --- a/drivers/spi/spi-geni-qcom.c +++ b/drivers/spi/spi-geni-qcom.c @@ -354,6 +354,12 @@ static bool geni_spi_handle_tx(struct spi_geni_master *mas) unsigned int bytes_per_fifo_word = geni_byte_per_fifo_word(mas); unsigned int i = 0; + /* Stop the watermark IRQ if nothing to send */ + if (!mas->cur_xfer) { + writel(0, se->base + SE_GENI_TX_WATERMARK_REG); + return false; + } + max_bytes = (mas->tx_fifo_depth - mas->tx_wm) * bytes_per_fifo_word; if (mas->tx_rem_bytes < max_bytes) max_bytes = mas->tx_rem_bytes; @@ -396,6 +402,14 @@ static void geni_spi_handle_rx(struct spi_geni_master *mas) if (rx_last_byte_valid && rx_last_byte_valid < 4) rx_bytes -= bytes_per_fifo_word - rx_last_byte_valid; } + + /* Clear out the FIFO and bail if nowhere to put it */ + if (!mas->cur_xfer) { + for (i = 0; i < DIV_ROUND_UP(rx_bytes, bytes_per_fifo_word); i++) + readl(se->base + SE_GENI_RX_FIFOn); + return; + } + if (mas->rx_rem_bytes < rx_bytes) rx_bytes = mas->rx_rem_bytes; From 690d8b917bbe64772cb0b652311bcd50908aea6b Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 17 Dec 2020 14:29:12 -0800 Subject: [PATCH 079/547] spi: spi-geni-qcom: Fail new xfers if xfer/cancel/abort pending If we got a timeout when trying to send an abort command then it means that we just got 3 timeouts in a row: 1. The original timeout that caused handle_fifo_timeout() to be called. 2. A one second timeout waiting for the cancel command to finish. 3. A one second timeout waiting for the abort command to finish. SPI is clocked by the controller, so nothing (aside from a hardware fault or a totally broken sequencer) should be causing the actual commands to fail in hardware. However, even though the hardware itself is not expected to fail (and it'd be hard to predict how we should handle things if it did), it's easy to hit the timeout case by simply blocking our interrupt handler from running for a long period of time. Obviously the system is in pretty bad shape if a interrupt handler is blocked for > 2 seconds, but there are certainly bugs (even bugs in other unrelated drivers) that can make this happen. Let's make things a bit more robust against this case. If we fail to abort we'll set a flag and then we'll block all future transfers until we have no more interrupts pending. Fixes: 561de45f72bd ("spi: spi-geni-qcom: Add SPI driver support for GENI based QUP") Signed-off-by: Douglas Anderson Reviewed-by: Stephen Boyd Link: https://lore.kernel.org/r/20201217142842.v3.2.Ibade998ed587e070388b4bf58801f1107a40eb53@changeid Signed-off-by: Mark Brown --- drivers/spi/spi-geni-qcom.c | 59 +++++++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi-geni-qcom.c b/drivers/spi/spi-geni-qcom.c index 096edfbde451..32c053705dec 100644 --- a/drivers/spi/spi-geni-qcom.c +++ b/drivers/spi/spi-geni-qcom.c @@ -83,6 +83,7 @@ struct spi_geni_master { spinlock_t lock; int irq; bool cs_flag; + bool abort_failed; }; static int get_spi_clk_cfg(unsigned int speed_hz, @@ -141,8 +142,49 @@ static void handle_fifo_timeout(struct spi_master *spi, spin_unlock_irq(&mas->lock); time_left = wait_for_completion_timeout(&mas->abort_done, HZ); - if (!time_left) + if (!time_left) { dev_err(mas->dev, "Failed to cancel/abort m_cmd\n"); + + /* + * No need for a lock since SPI core has a lock and we never + * access this from an interrupt. + */ + mas->abort_failed = true; + } +} + +static bool spi_geni_is_abort_still_pending(struct spi_geni_master *mas) +{ + struct geni_se *se = &mas->se; + u32 m_irq, m_irq_en; + + if (!mas->abort_failed) + return false; + + /* + * The only known case where a transfer times out and then a cancel + * times out then an abort times out is if something is blocking our + * interrupt handler from running. Avoid starting any new transfers + * until that sorts itself out. + */ + spin_lock_irq(&mas->lock); + m_irq = readl(se->base + SE_GENI_M_IRQ_STATUS); + m_irq_en = readl(se->base + SE_GENI_M_IRQ_EN); + spin_unlock_irq(&mas->lock); + + if (m_irq & m_irq_en) { + dev_err(mas->dev, "Interrupts pending after abort: %#010x\n", + m_irq & m_irq_en); + return true; + } + + /* + * If we're here the problem resolved itself so no need to check more + * on future transfers. + */ + mas->abort_failed = false; + + return false; } static void spi_geni_set_cs(struct spi_device *slv, bool set_flag) @@ -158,9 +200,15 @@ static void spi_geni_set_cs(struct spi_device *slv, bool set_flag) if (set_flag == mas->cs_flag) return; + pm_runtime_get_sync(mas->dev); + + if (spi_geni_is_abort_still_pending(mas)) { + dev_err(mas->dev, "Can't set chip select\n"); + goto exit; + } + mas->cs_flag = set_flag; - pm_runtime_get_sync(mas->dev); spin_lock_irq(&mas->lock); reinit_completion(&mas->cs_done); if (set_flag) @@ -173,6 +221,7 @@ static void spi_geni_set_cs(struct spi_device *slv, bool set_flag) if (!time_left) handle_fifo_timeout(spi, NULL); +exit: pm_runtime_put(mas->dev); } @@ -280,6 +329,9 @@ static int spi_geni_prepare_message(struct spi_master *spi, int ret; struct spi_geni_master *mas = spi_master_get_devdata(spi); + if (spi_geni_is_abort_still_pending(mas)) + return -EBUSY; + ret = setup_fifo_params(spi_msg->spi, spi); if (ret) dev_err(mas->dev, "Couldn't select mode %d\n", ret); @@ -509,6 +561,9 @@ static int spi_geni_transfer_one(struct spi_master *spi, { struct spi_geni_master *mas = spi_master_get_devdata(spi); + if (spi_geni_is_abort_still_pending(mas)) + return -EBUSY; + /* Terminate and return success for 0 byte length transfer */ if (!xfer->len) return 0; From 3d7d916f9bc98ce88272b3e4405c7c685afbfcd6 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 17 Dec 2020 14:29:13 -0800 Subject: [PATCH 080/547] spi: spi-geni-qcom: Don't try to set CS if an xfer is pending If we get a timeout sending then this happens: spi_transfer_one_message() ->transfer_one() AKA spi_geni_transfer_one() setup_fifo_xfer() mas->cur_xfer = non-NULL spi_transfer_wait() => TIMES OUT if (msg->status != -EINPROGRESS) goto out if (ret != 0 ...) spi_set_cs() ->set_cs AKA spi_geni_set_cs() # mas->cur_xfer is non-NULL The above happens _before_ the SPI core calls ->handle_err() AKA handle_fifo_timeout(). Unfortunately that won't work so well on geni. If we got a timeout transferring then it's likely that our interrupt handler is blocked, but we need that same interrupt handler to run and the command channel to be unblocked in order to adjust the chip select. Trying to set the chip select doesn't crash us but ends up confusing our state machine and leads to messages like: Premature done. rx_rem = 32 bpw8 Let's just drop the chip select request in this case. We can detect the case because cur_xfer is non-NULL--it would have been set to NULL in the interrupt handler if the previous transfer had finished. Sure, we might leave the chip select in the wrong state but it's likely it was going to fail anyway and this avoids getting the driver even more confused about what it's doing. The SPI core in general assumes that setting chip select is a simple operation that doesn't fail. Yet another reason to just reconfigure the chip select line as GPIOs. Signed-off-by: Douglas Anderson Reviewed-by: Stephen Boyd Link: https://lore.kernel.org/r/20201217142842.v3.3.I07afdedcc49655c5d26880f8df9170aac5792378@changeid Signed-off-by: Mark Brown --- drivers/spi/spi-geni-qcom.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi-geni-qcom.c b/drivers/spi/spi-geni-qcom.c index 32c053705dec..365565d3a998 100644 --- a/drivers/spi/spi-geni-qcom.c +++ b/drivers/spi/spi-geni-qcom.c @@ -207,9 +207,14 @@ static void spi_geni_set_cs(struct spi_device *slv, bool set_flag) goto exit; } - mas->cs_flag = set_flag; - spin_lock_irq(&mas->lock); + if (mas->cur_xfer) { + dev_err(mas->dev, "Can't set CS when prev xfer running\n"); + spin_unlock_irq(&mas->lock); + goto exit; + } + + mas->cs_flag = set_flag; reinit_completion(&mas->cs_done); if (set_flag) geni_se_setup_m_cmd(se, SPI_CS_ASSERT, 0); From 17fa81aa702ec118f2b835715897041675b06336 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 17 Dec 2020 14:29:14 -0800 Subject: [PATCH 081/547] spi: spi-geni-qcom: Print an error when we timeout setting the CS If we're using geni to manage the chip select line (don't do it--use a GPIO!) and we happen to get a timeout waiting for the chip select command to be completed, no errors are printed even though things might not be in the best shape. Let's add a print. Signed-off-by: Douglas Anderson Reviewed-by: Stephen Boyd Link: https://lore.kernel.org/r/20201217142842.v3.4.I666b37646de9652cef438ac7c2c6c2053367fc6b@changeid Signed-off-by: Mark Brown --- drivers/spi/spi-geni-qcom.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/spi/spi-geni-qcom.c b/drivers/spi/spi-geni-qcom.c index 365565d3a998..881f645661cc 100644 --- a/drivers/spi/spi-geni-qcom.c +++ b/drivers/spi/spi-geni-qcom.c @@ -223,8 +223,10 @@ static void spi_geni_set_cs(struct spi_device *slv, bool set_flag) spin_unlock_irq(&mas->lock); time_left = wait_for_completion_timeout(&mas->cs_done, HZ); - if (!time_left) + if (!time_left) { + dev_warn(mas->dev, "Timeout setting chip select\n"); handle_fifo_timeout(spi, NULL); + } exit: pm_runtime_put(mas->dev); From abdcd06c4dedbcabaec68c433c7f53f33307811f Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Wed, 16 Dec 2020 09:28:04 +0200 Subject: [PATCH 082/547] net: af_packet: fix procfs header for 64-bit pointers On 64-bit systems the packet procfs header field names following 'sk' are not aligned correctly: sk RefCnt Type Proto Iface R Rmem User Inode 00000000605d2c64 3 3 0003 7 1 450880 0 16643 00000000080e9b80 2 2 0000 0 0 0 0 17404 00000000b23b8a00 2 2 0000 0 0 0 0 17421 ... With this change field names are correctly aligned: sk RefCnt Type Proto Iface R Rmem User Inode 000000005c3b1d97 3 3 0003 7 1 21568 0 16178 000000007be55bb7 3 3 fbce 8 1 0 0 16250 00000000be62127d 3 3 fbcd 8 1 0 0 16254 ... Signed-off-by: Baruch Siach Link: https://lore.kernel.org/r/54917251d8433735d9a24e935a6cb8eb88b4058a.1608103684.git.baruch@tkos.co.il Signed-off-by: Jakub Kicinski --- net/packet/af_packet.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index de8e8dbbdeb8..6bbc7a448593 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4595,7 +4595,9 @@ static void packet_seq_stop(struct seq_file *seq, void *v) static int packet_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) - seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n"); + seq_printf(seq, + "%*sRefCnt Type Proto Iface R Rmem User Inode\n", + IS_ENABLED(CONFIG_64BIT) ? -17 : -9, "sk"); else { struct sock *s = sk_entry(v); const struct packet_sock *po = pkt_sk(s); From 698285da79f5b0b099db15a37ac661ac408c80eb Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Thu, 17 Dec 2020 22:29:46 +0100 Subject: [PATCH 083/547] net/sched: sch_taprio: ensure to reset/destroy all child qdiscs taprio_graft() can insert a NULL element in the array of child qdiscs. As a consquence, taprio_reset() might not reset child qdiscs completely, and taprio_destroy() might leak resources. Fix it by ensuring that loops that iterate over q->qdiscs[] don't end when they find the first NULL item. Fixes: 44d4775ca518 ("net/sched: sch_taprio: reset child qdiscs before freeing them") Fixes: 5a781ccbd19e ("tc: Add support for configuring the taprio scheduler") Suggested-by: Jakub Kicinski Signed-off-by: Davide Caratti Link: https://lore.kernel.org/r/13edef6778fef03adc751582562fba4a13e06d6a.1608240532.git.dcaratti@redhat.com Signed-off-by: Jakub Kicinski --- net/sched/sch_taprio.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index c74817ec9964..6f775275826a 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1605,8 +1605,9 @@ static void taprio_reset(struct Qdisc *sch) hrtimer_cancel(&q->advance_timer); if (q->qdiscs) { - for (i = 0; i < dev->num_tx_queues && q->qdiscs[i]; i++) - qdisc_reset(q->qdiscs[i]); + for (i = 0; i < dev->num_tx_queues; i++) + if (q->qdiscs[i]) + qdisc_reset(q->qdiscs[i]); } sch->qstats.backlog = 0; sch->q.qlen = 0; @@ -1626,7 +1627,7 @@ static void taprio_destroy(struct Qdisc *sch) taprio_disable_offload(dev, q, NULL); if (q->qdiscs) { - for (i = 0; i < dev->num_tx_queues && q->qdiscs[i]; i++) + for (i = 0; i < dev->num_tx_queues; i++) qdisc_put(q->qdiscs[i]); kfree(q->qdiscs); From 87508224485323ce2d4e7fb929ec80f51adcc238 Mon Sep 17 00:00:00 2001 From: Stefan Chulski Date: Thu, 17 Dec 2020 16:52:15 +0200 Subject: [PATCH 084/547] net: mvpp2: disable force link UP during port init procedure Force link UP can be enabled by bootloader during tftpboot and breaks NFS support. Force link UP disabled during port init procedure. Fixes: f84bf386f395 ("net: mvpp2: initialize the GoP") Signed-off-by: Stefan Chulski Acked-by: Marcin Wojtas Link: https://lore.kernel.org/r/1608216735-14501-1-git-send-email-stefanc@marvell.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index afdd22827223..0f18d034ce3e 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -5487,7 +5487,7 @@ static int mvpp2_port_init(struct mvpp2_port *port) struct mvpp2 *priv = port->priv; struct mvpp2_txq_pcpu *txq_pcpu; unsigned int thread; - int queue, err; + int queue, err, val; /* Checks for hardware constraints */ if (port->first_rxq + port->nrxqs > @@ -5501,6 +5501,18 @@ static int mvpp2_port_init(struct mvpp2_port *port) mvpp2_egress_disable(port); mvpp2_port_disable(port); + if (mvpp2_is_xlg(port->phy_interface)) { + val = readl(port->base + MVPP22_XLG_CTRL0_REG); + val &= ~MVPP22_XLG_CTRL0_FORCE_LINK_PASS; + val |= MVPP22_XLG_CTRL0_FORCE_LINK_DOWN; + writel(val, port->base + MVPP22_XLG_CTRL0_REG); + } else { + val = readl(port->base + MVPP2_GMAC_AUTONEG_CONFIG); + val &= ~MVPP2_GMAC_FORCE_LINK_PASS; + val |= MVPP2_GMAC_FORCE_LINK_DOWN; + writel(val, port->base + MVPP2_GMAC_AUTONEG_CONFIG); + } + port->tx_time_coal = MVPP2_TXDONE_COAL_USEC; port->txqs = devm_kcalloc(dev, port->ntxqs, sizeof(*port->txqs), From 3f48fab62bb81a7f9d01e9d43c40395fad011dd5 Mon Sep 17 00:00:00 2001 From: Stefan Chulski Date: Thu, 17 Dec 2020 20:30:17 +0200 Subject: [PATCH 085/547] net: mvpp2: Add TCAM entry to drop flow control pause frames Issue: Flow control frame used to pause GoP(MAC) was delivered to the CPU and created a load on the CPU. Since XOFF/XON frames are used only by MAC, these frames should be dropped inside MAC. Fix: According to 802.3-2012 - IEEE Standard for Ethernet pause frame has unique destination MAC address 01-80-C2-00-00-01. Add TCAM parser entry to track and drop pause frames by destination MAC. Fixes: 3f518509dedc ("ethernet: Add new driver for Marvell Armada 375 network unit") Signed-off-by: Stefan Chulski Link: https://lore.kernel.org/r/1608229817-21951-1-git-send-email-stefanc@marvell.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/marvell/mvpp2/mvpp2_prs.c | 33 +++++++++++++++++++ .../net/ethernet/marvell/mvpp2/mvpp2_prs.h | 2 +- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.c index 5692c6087bbb..f069593d7e50 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.c @@ -405,6 +405,38 @@ static int mvpp2_prs_tcam_first_free(struct mvpp2 *priv, unsigned char start, return -EINVAL; } +/* Drop flow control pause frames */ +static void mvpp2_prs_drop_fc(struct mvpp2 *priv) +{ + unsigned char da[ETH_ALEN] = { 0x01, 0x80, 0xC2, 0x00, 0x00, 0x01 }; + struct mvpp2_prs_entry pe; + unsigned int len; + + memset(&pe, 0, sizeof(pe)); + + /* For all ports - drop flow control frames */ + pe.index = MVPP2_PE_FC_DROP; + mvpp2_prs_tcam_lu_set(&pe, MVPP2_PRS_LU_MAC); + + /* Set match on DA */ + len = ETH_ALEN; + while (len--) + mvpp2_prs_tcam_data_byte_set(&pe, len, da[len], 0xff); + + mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_DROP_MASK, + MVPP2_PRS_RI_DROP_MASK); + + mvpp2_prs_sram_bits_set(&pe, MVPP2_PRS_SRAM_LU_GEN_BIT, 1); + mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_FLOWS); + + /* Mask all ports */ + mvpp2_prs_tcam_port_map_set(&pe, MVPP2_PRS_PORT_MASK); + + /* Update shadow table and hw entry */ + mvpp2_prs_shadow_set(priv, pe.index, MVPP2_PRS_LU_MAC); + mvpp2_prs_hw_write(priv, &pe); +} + /* Enable/disable dropping all mac da's */ static void mvpp2_prs_mac_drop_all_set(struct mvpp2 *priv, int port, bool add) { @@ -1162,6 +1194,7 @@ static void mvpp2_prs_mac_init(struct mvpp2 *priv) mvpp2_prs_hw_write(priv, &pe); /* Create dummy entries for drop all and promiscuous modes */ + mvpp2_prs_drop_fc(priv); mvpp2_prs_mac_drop_all_set(priv, 0, false); mvpp2_prs_mac_promisc_set(priv, 0, MVPP2_PRS_L2_UNI_CAST, false); mvpp2_prs_mac_promisc_set(priv, 0, MVPP2_PRS_L2_MULTI_CAST, false); diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.h b/drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.h index e22f6c85d380..4b68dd374733 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.h +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.h @@ -129,7 +129,7 @@ #define MVPP2_PE_VID_EDSA_FLTR_DEFAULT (MVPP2_PRS_TCAM_SRAM_SIZE - 7) #define MVPP2_PE_VLAN_DBL (MVPP2_PRS_TCAM_SRAM_SIZE - 6) #define MVPP2_PE_VLAN_NONE (MVPP2_PRS_TCAM_SRAM_SIZE - 5) -/* reserved */ +#define MVPP2_PE_FC_DROP (MVPP2_PRS_TCAM_SRAM_SIZE - 4) #define MVPP2_PE_MAC_MC_PROMISCUOUS (MVPP2_PRS_TCAM_SRAM_SIZE - 3) #define MVPP2_PE_MAC_UC_PROMISCUOUS (MVPP2_PRS_TCAM_SRAM_SIZE - 2) #define MVPP2_PE_MAC_NON_PROMISCUOUS (MVPP2_PRS_TCAM_SRAM_SIZE - 1) From fec6079b2eeab319d9e3d074f54d3b6f623e9701 Mon Sep 17 00:00:00 2001 From: Stefan Chulski Date: Thu, 17 Dec 2020 20:37:46 +0200 Subject: [PATCH 086/547] net: mvpp2: prs: fix PPPoE with ipv6 packet parse Current PPPoE+IPv6 entry is jumping to 'next-hdr' field and not to 'DIP' field as done for IPv4. Fixes: 3f518509dedc ("ethernet: Add new driver for Marvell Armada 375 network unit") Reported-by: Liron Himi Signed-off-by: Stefan Chulski Link: https://lore.kernel.org/r/1608230266-22111-1-git-send-email-stefanc@marvell.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.c index f069593d7e50..a30eb90ba3d2 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_prs.c @@ -1680,8 +1680,9 @@ static int mvpp2_prs_pppoe_init(struct mvpp2 *priv) mvpp2_prs_sram_next_lu_set(&pe, MVPP2_PRS_LU_IP6); mvpp2_prs_sram_ri_update(&pe, MVPP2_PRS_RI_L3_IP6, MVPP2_PRS_RI_L3_PROTO_MASK); - /* Skip eth_type + 4 bytes of IPv6 header */ - mvpp2_prs_sram_shift_set(&pe, MVPP2_ETH_TYPE_LEN + 4, + /* Jump to DIP of IPV6 header */ + mvpp2_prs_sram_shift_set(&pe, MVPP2_ETH_TYPE_LEN + 8 + + MVPP2_MAX_L3_ADDR_SIZE, MVPP2_PRS_SRAM_OP_SEL_SHIFT_ADD); /* Set L3 offset */ mvpp2_prs_sram_offset_set(&pe, MVPP2_PRS_SRAM_UDF_TYPE_L3, From e16ab3db87b3d5d4118dfb68e955f62c4e09573a Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Sat, 5 Dec 2020 11:35:25 +0100 Subject: [PATCH 087/547] mt76: usb: remove wake logic in mt76u_status_worker Similar to mmio code path, remove wake logic in mt76u_status_worker handler. Starting from commit 90d494c99a99 ("mt76: improve tx queue stop/wake")', the wake queue logic on the usb status path is no longer necessary since the hw queues are no longer stopped on the mt76 tx path. Signed-off-by: Lorenzo Bianconi Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/00009bf0cfdc9565e4432cad3ed51888c667c25d.1607164041.git.lorenzo@kernel.org --- drivers/net/wireless/mediatek/mt76/usb.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/usb.c b/drivers/net/wireless/mediatek/mt76/usb.c index 8b08cad131c8..b95d093728b9 100644 --- a/drivers/net/wireless/mediatek/mt76/usb.c +++ b/drivers/net/wireless/mediatek/mt76/usb.c @@ -811,7 +811,6 @@ static void mt76u_status_worker(struct mt76_worker *w) struct mt76_dev *dev = container_of(usb, struct mt76_dev, usb); struct mt76_queue_entry entry; struct mt76_queue *q; - bool wake; int i; for (i = 0; i < IEEE80211_NUM_ACS; i++) { @@ -829,10 +828,6 @@ static void mt76u_status_worker(struct mt76_worker *w) mt76_queue_tx_complete(dev, q, &entry); } - wake = q->stopped && q->queued < q->ndesc - 8; - if (wake) - q->stopped = false; - if (!q->queued) wake_up(&dev->tx_wait); @@ -841,8 +836,6 @@ static void mt76u_status_worker(struct mt76_worker *w) if (dev->drv->tx_status_data && !test_and_set_bit(MT76_READING_STATS, &dev->phy.state)) queue_work(dev->wq, &dev->usb.stat_work); - if (wake) - ieee80211_wake_queue(dev->hw, i); } } From 123bb2b737881127b450e8b3b1bae69a8949498e Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Sat, 5 Dec 2020 11:35:26 +0100 Subject: [PATCH 088/547] mt76: sdio: remove wake logic in mt76s_process_tx_queue Similar to mmio/usb code path, remove wake logic in mt76s_process_tx_queue routine. Starting from commit 90d494c99a99 ("mt76: improve tx queue stop/wake"), the wake queue logic on the sdio status path is no longer necessary since the hw queues are no longer stopped on the mt76 tx path. Signed-off-by: Lorenzo Bianconi Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/d2d7d9d437f4dec2ef1df0ed070b9cf299f021ad.1607164041.git.lorenzo@kernel.org --- drivers/net/wireless/mediatek/mt76/sdio.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/sdio.c b/drivers/net/wireless/mediatek/mt76/sdio.c index 62b5b912818f..7cd995118257 100644 --- a/drivers/net/wireless/mediatek/mt76/sdio.c +++ b/drivers/net/wireless/mediatek/mt76/sdio.c @@ -157,7 +157,7 @@ static void mt76s_net_worker(struct mt76_worker *w) static int mt76s_process_tx_queue(struct mt76_dev *dev, struct mt76_queue *q) { - bool wake, mcu = q == dev->q_mcu[MT_MCUQ_WM]; + bool mcu = q == dev->q_mcu[MT_MCUQ_WM]; struct mt76_queue_entry entry; int nframes = 0; @@ -177,21 +177,12 @@ static int mt76s_process_tx_queue(struct mt76_dev *dev, struct mt76_queue *q) nframes++; } - wake = q->stopped && q->queued < q->ndesc - 8; - if (wake) - q->stopped = false; - if (!q->queued) wake_up(&dev->tx_wait); - if (mcu) - goto out; + if (!mcu) + mt76_txq_schedule(&dev->phy, q->qid); - mt76_txq_schedule(&dev->phy, q->qid); - - if (wake) - ieee80211_wake_queue(dev->hw, q->qid); -out: return nframes; } From f7217f718747641fc80cd062f183107439f2a066 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 8 Dec 2020 10:18:11 +0100 Subject: [PATCH 089/547] mt76: mt76s: fix NULL pointer dereference in mt76s_process_tx_queue Fix a possible NULL pointer dereference in mt76s_process_tx_queue that can occur if status thread runs before allocating tx queues Fixes: 6a618acb7e62 ("mt76: sdio: convert {status/net}_work to mt76_worker") Signed-off-by: Lorenzo Bianconi Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/b49c1b4edacd87b2241a9fd0431dd4864c8963f6.1607418933.git.lorenzo@kernel.org --- drivers/net/wireless/mediatek/mt76/sdio.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/mediatek/mt76/sdio.c b/drivers/net/wireless/mediatek/mt76/sdio.c index 7cd995118257..0b6facb17ff7 100644 --- a/drivers/net/wireless/mediatek/mt76/sdio.c +++ b/drivers/net/wireless/mediatek/mt76/sdio.c @@ -157,10 +157,14 @@ static void mt76s_net_worker(struct mt76_worker *w) static int mt76s_process_tx_queue(struct mt76_dev *dev, struct mt76_queue *q) { - bool mcu = q == dev->q_mcu[MT_MCUQ_WM]; struct mt76_queue_entry entry; int nframes = 0; + bool mcu; + if (!q) + return 0; + + mcu = q == dev->q_mcu[MT_MCUQ_WM]; while (q->queued > 0) { if (!q->entry[q->tail].done) break; From 0bd157fa2aaa2c77d6254321d7751aa9eec68c7b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 18 Dec 2020 09:32:02 -0800 Subject: [PATCH 090/547] mt76: mt7915: fix MESH ifdef block Fix a build error when CONFIG_MAC80211_MESH is not enabled: ../drivers/net/wireless/mediatek/mt76/mt7915/init.c:47:2: error: expected expression before '}' token }, { ^ Fixes: af901eb4ab80 ("mt76: mt7915: get rid of dbdc debugfs knob") Signed-off-by: Randy Dunlap Cc: Shayne Chen Cc: Ryder Lee Cc: Lorenzo Bianconi Cc: Felix Fietkau Cc: linux-wireless@vger.kernel.org Cc: Kalle Valo Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20201218173202.23159-1-rdunlap@infradead.org --- drivers/net/wireless/mediatek/mt76/mt7915/init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/init.c b/drivers/net/wireless/mediatek/mt76/mt7915/init.c index ed4635bd151a..102a8f14c22d 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/init.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/init.c @@ -40,9 +40,9 @@ static const struct ieee80211_iface_limit if_limits[] = { .types = BIT(NL80211_IFTYPE_ADHOC) }, { .max = 16, - .types = BIT(NL80211_IFTYPE_AP) | + .types = BIT(NL80211_IFTYPE_AP) #ifdef CONFIG_MAC80211_MESH - BIT(NL80211_IFTYPE_MESH_POINT) + | BIT(NL80211_IFTYPE_MESH_POINT) #endif }, { .max = MT7915_MAX_INTERFACES, From bfe55584713b4d4d518ffe9cf2dab1129eba6321 Mon Sep 17 00:00:00 2001 From: Arend van Spriel Date: Sun, 20 Dec 2020 15:18:07 +0100 Subject: [PATCH 091/547] MAINTAINERS: switch to different email address Switching to private mail account as work email is polluted with a legal disclaimer. Just making it extra clear by changing the email address in the MAINTAINERS file as well. Signed-off-by: Arend van Spriel Signed-off-by: Arend van Spriel Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20201220141807.17278-1-aspriel@gmail.com --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 5c1a6ba5ef26..85a7c3a2ed63 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3545,7 +3545,7 @@ S: Supported F: drivers/net/ethernet/broadcom/bnxt/ BROADCOM BRCM80211 IEEE802.11n WIRELESS DRIVER -M: Arend van Spriel +M: Arend van Spriel M: Franky Lin M: Hante Meuleman M: Chi-hsien Lin From a590370d918fc66c62df6620445791fbe840344a Mon Sep 17 00:00:00 2001 From: Roman Guskov Date: Mon, 21 Dec 2020 13:35:32 +0100 Subject: [PATCH 092/547] spi: stm32: FIFO threshold level - fix align packet size if cur_bpw <= 8 and xfer_len < 4 then the value of fthlv will be 1 and SPI registers content may have been lost. * If SPI data register is accessed as a 16-bit register and DSIZE <= 8bit, better to select FTHLV = 2, 4, 6 etc * If SPI data register is accessed as a 32-bit register and DSIZE > 8bit, better to select FTHLV = 2, 4, 6 etc, while if DSIZE <= 8bit, better to select FTHLV = 4, 8, 12 etc Signed-off-by: Roman Guskov Fixes: dcbe0d84dfa5 ("spi: add driver for STM32 SPI controller") Reviewed-by: Marek Vasut Link: https://lore.kernel.org/r/20201221123532.27272-1-rguskov@dh-electronics.com Signed-off-by: Mark Brown --- drivers/spi/spi-stm32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi-stm32.c b/drivers/spi/spi-stm32.c index 471dedf3d339..6017209c6d2f 100644 --- a/drivers/spi/spi-stm32.c +++ b/drivers/spi/spi-stm32.c @@ -493,9 +493,9 @@ static u32 stm32h7_spi_prepare_fthlv(struct stm32_spi *spi, u32 xfer_len) /* align packet size with data registers access */ if (spi->cur_bpw > 8) - fthlv -= (fthlv % 2); /* multiple of 2 */ + fthlv += (fthlv % 2) ? 1 : 0; else - fthlv -= (fthlv % 4); /* multiple of 4 */ + fthlv += (fthlv % 4) ? (4 - (fthlv % 4)) : 0; if (!fthlv) fthlv = 1; From 3b66e4a8e58a85af3212c7117d7a29c9ef6679a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guido=20G=C3=BCnther?= Date: Fri, 18 Dec 2020 19:38:07 +0100 Subject: [PATCH 093/547] regulator: bd718x7: Add enable times MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the typical startup times from the data sheet so boards get a reasonable default. Not setting any enable time can lead to board hangs when e.g. clocks are enabled too soon afterwards. This fixes gpu power domain resume on the Librem 5. [Moved #defines into driver, seems to be general agreement and avoids any cross tree issues -- broonie] Signed-off-by: Guido Günther Reviewed-by: Matti Vaittinen Link: https://lore.kernel.org/r/41fb2ed19f584f138336344e2297ae7301f72b75.1608316658.git.agx@sigxcpu.org Signed-off-by: Mark Brown --- drivers/regulator/bd718x7-regulator.c | 57 +++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/drivers/regulator/bd718x7-regulator.c b/drivers/regulator/bd718x7-regulator.c index e6d5d98c3cea..9309765d0450 100644 --- a/drivers/regulator/bd718x7-regulator.c +++ b/drivers/regulator/bd718x7-regulator.c @@ -15,6 +15,36 @@ #include #include +/* Typical regulator startup times as per data sheet in uS */ +#define BD71847_BUCK1_STARTUP_TIME 144 +#define BD71847_BUCK2_STARTUP_TIME 162 +#define BD71847_BUCK3_STARTUP_TIME 162 +#define BD71847_BUCK4_STARTUP_TIME 240 +#define BD71847_BUCK5_STARTUP_TIME 270 +#define BD71847_BUCK6_STARTUP_TIME 200 +#define BD71847_LDO1_STARTUP_TIME 440 +#define BD71847_LDO2_STARTUP_TIME 370 +#define BD71847_LDO3_STARTUP_TIME 310 +#define BD71847_LDO4_STARTUP_TIME 400 +#define BD71847_LDO5_STARTUP_TIME 530 +#define BD71847_LDO6_STARTUP_TIME 400 + +#define BD71837_BUCK1_STARTUP_TIME 160 +#define BD71837_BUCK2_STARTUP_TIME 180 +#define BD71837_BUCK3_STARTUP_TIME 180 +#define BD71837_BUCK4_STARTUP_TIME 180 +#define BD71837_BUCK5_STARTUP_TIME 160 +#define BD71837_BUCK6_STARTUP_TIME 240 +#define BD71837_BUCK7_STARTUP_TIME 220 +#define BD71837_BUCK8_STARTUP_TIME 200 +#define BD71837_LDO1_STARTUP_TIME 440 +#define BD71837_LDO2_STARTUP_TIME 370 +#define BD71837_LDO3_STARTUP_TIME 310 +#define BD71837_LDO4_STARTUP_TIME 400 +#define BD71837_LDO5_STARTUP_TIME 310 +#define BD71837_LDO6_STARTUP_TIME 400 +#define BD71837_LDO7_STARTUP_TIME 530 + /* * BD718(37/47/50) have two "enable control modes". ON/OFF can either be * controlled by software - or by PMIC internal HW state machine. Whether @@ -613,6 +643,7 @@ static struct bd718xx_regulator_data bd71847_regulators[] = { .vsel_mask = DVS_BUCK_RUN_MASK, .enable_reg = BD718XX_REG_BUCK1_CTRL, .enable_mask = BD718XX_BUCK_EN, + .enable_time = BD71847_BUCK1_STARTUP_TIME, .owner = THIS_MODULE, .of_parse_cb = buck_set_hw_dvs_levels, }, @@ -646,6 +677,7 @@ static struct bd718xx_regulator_data bd71847_regulators[] = { .vsel_mask = DVS_BUCK_RUN_MASK, .enable_reg = BD718XX_REG_BUCK2_CTRL, .enable_mask = BD718XX_BUCK_EN, + .enable_time = BD71847_BUCK2_STARTUP_TIME, .owner = THIS_MODULE, .of_parse_cb = buck_set_hw_dvs_levels, }, @@ -680,6 +712,7 @@ static struct bd718xx_regulator_data bd71847_regulators[] = { .linear_range_selectors = bd71847_buck3_volt_range_sel, .enable_reg = BD718XX_REG_1ST_NODVS_BUCK_CTRL, .enable_mask = BD718XX_BUCK_EN, + .enable_time = BD71847_BUCK3_STARTUP_TIME, .owner = THIS_MODULE, }, .init = { @@ -706,6 +739,7 @@ static struct bd718xx_regulator_data bd71847_regulators[] = { .vsel_range_mask = BD71847_BUCK4_RANGE_MASK, .linear_range_selectors = bd71847_buck4_volt_range_sel, .enable_mask = BD718XX_BUCK_EN, + .enable_time = BD71847_BUCK4_STARTUP_TIME, .owner = THIS_MODULE, }, .init = { @@ -727,6 +761,7 @@ static struct bd718xx_regulator_data bd71847_regulators[] = { .vsel_mask = BD718XX_3RD_NODVS_BUCK_MASK, .enable_reg = BD718XX_REG_3RD_NODVS_BUCK_CTRL, .enable_mask = BD718XX_BUCK_EN, + .enable_time = BD71847_BUCK5_STARTUP_TIME, .owner = THIS_MODULE, }, .init = { @@ -750,6 +785,7 @@ static struct bd718xx_regulator_data bd71847_regulators[] = { .vsel_mask = BD718XX_4TH_NODVS_BUCK_MASK, .enable_reg = BD718XX_REG_4TH_NODVS_BUCK_CTRL, .enable_mask = BD718XX_BUCK_EN, + .enable_time = BD71847_BUCK6_STARTUP_TIME, .owner = THIS_MODULE, }, .init = { @@ -775,6 +811,7 @@ static struct bd718xx_regulator_data bd71847_regulators[] = { .linear_range_selectors = bd718xx_ldo1_volt_range_sel, .enable_reg = BD718XX_REG_LDO1_VOLT, .enable_mask = BD718XX_LDO_EN, + .enable_time = BD71847_LDO1_STARTUP_TIME, .owner = THIS_MODULE, }, .init = { @@ -796,6 +833,7 @@ static struct bd718xx_regulator_data bd71847_regulators[] = { .n_voltages = ARRAY_SIZE(ldo_2_volts), .enable_reg = BD718XX_REG_LDO2_VOLT, .enable_mask = BD718XX_LDO_EN, + .enable_time = BD71847_LDO2_STARTUP_TIME, .owner = THIS_MODULE, }, .init = { @@ -818,6 +856,7 @@ static struct bd718xx_regulator_data bd71847_regulators[] = { .vsel_mask = BD718XX_LDO3_MASK, .enable_reg = BD718XX_REG_LDO3_VOLT, .enable_mask = BD718XX_LDO_EN, + .enable_time = BD71847_LDO3_STARTUP_TIME, .owner = THIS_MODULE, }, .init = { @@ -840,6 +879,7 @@ static struct bd718xx_regulator_data bd71847_regulators[] = { .vsel_mask = BD718XX_LDO4_MASK, .enable_reg = BD718XX_REG_LDO4_VOLT, .enable_mask = BD718XX_LDO_EN, + .enable_time = BD71847_LDO4_STARTUP_TIME, .owner = THIS_MODULE, }, .init = { @@ -865,6 +905,7 @@ static struct bd718xx_regulator_data bd71847_regulators[] = { .linear_range_selectors = bd71847_ldo5_volt_range_sel, .enable_reg = BD718XX_REG_LDO5_VOLT, .enable_mask = BD718XX_LDO_EN, + .enable_time = BD71847_LDO5_STARTUP_TIME, .owner = THIS_MODULE, }, .init = { @@ -889,6 +930,7 @@ static struct bd718xx_regulator_data bd71847_regulators[] = { .vsel_mask = BD718XX_LDO6_MASK, .enable_reg = BD718XX_REG_LDO6_VOLT, .enable_mask = BD718XX_LDO_EN, + .enable_time = BD71847_LDO6_STARTUP_TIME, .owner = THIS_MODULE, }, .init = { @@ -942,6 +984,7 @@ static struct bd718xx_regulator_data bd71837_regulators[] = { .vsel_mask = DVS_BUCK_RUN_MASK, .enable_reg = BD718XX_REG_BUCK1_CTRL, .enable_mask = BD718XX_BUCK_EN, + .enable_time = BD71837_BUCK1_STARTUP_TIME, .owner = THIS_MODULE, .of_parse_cb = buck_set_hw_dvs_levels, }, @@ -975,6 +1018,7 @@ static struct bd718xx_regulator_data bd71837_regulators[] = { .vsel_mask = DVS_BUCK_RUN_MASK, .enable_reg = BD718XX_REG_BUCK2_CTRL, .enable_mask = BD718XX_BUCK_EN, + .enable_time = BD71837_BUCK2_STARTUP_TIME, .owner = THIS_MODULE, .of_parse_cb = buck_set_hw_dvs_levels, }, @@ -1005,6 +1049,7 @@ static struct bd718xx_regulator_data bd71837_regulators[] = { .vsel_mask = DVS_BUCK_RUN_MASK, .enable_reg = BD71837_REG_BUCK3_CTRL, .enable_mask = BD718XX_BUCK_EN, + .enable_time = BD71837_BUCK3_STARTUP_TIME, .owner = THIS_MODULE, .of_parse_cb = buck_set_hw_dvs_levels, }, @@ -1033,6 +1078,7 @@ static struct bd718xx_regulator_data bd71837_regulators[] = { .vsel_mask = DVS_BUCK_RUN_MASK, .enable_reg = BD71837_REG_BUCK4_CTRL, .enable_mask = BD718XX_BUCK_EN, + .enable_time = BD71837_BUCK4_STARTUP_TIME, .owner = THIS_MODULE, .of_parse_cb = buck_set_hw_dvs_levels, }, @@ -1065,6 +1111,7 @@ static struct bd718xx_regulator_data bd71837_regulators[] = { .linear_range_selectors = bd71837_buck5_volt_range_sel, .enable_reg = BD718XX_REG_1ST_NODVS_BUCK_CTRL, .enable_mask = BD718XX_BUCK_EN, + .enable_time = BD71837_BUCK5_STARTUP_TIME, .owner = THIS_MODULE, }, .init = { @@ -1088,6 +1135,7 @@ static struct bd718xx_regulator_data bd71837_regulators[] = { .vsel_mask = BD71837_BUCK6_MASK, .enable_reg = BD718XX_REG_2ND_NODVS_BUCK_CTRL, .enable_mask = BD718XX_BUCK_EN, + .enable_time = BD71837_BUCK6_STARTUP_TIME, .owner = THIS_MODULE, }, .init = { @@ -1109,6 +1157,7 @@ static struct bd718xx_regulator_data bd71837_regulators[] = { .vsel_mask = BD718XX_3RD_NODVS_BUCK_MASK, .enable_reg = BD718XX_REG_3RD_NODVS_BUCK_CTRL, .enable_mask = BD718XX_BUCK_EN, + .enable_time = BD71837_BUCK7_STARTUP_TIME, .owner = THIS_MODULE, }, .init = { @@ -1132,6 +1181,7 @@ static struct bd718xx_regulator_data bd71837_regulators[] = { .vsel_mask = BD718XX_4TH_NODVS_BUCK_MASK, .enable_reg = BD718XX_REG_4TH_NODVS_BUCK_CTRL, .enable_mask = BD718XX_BUCK_EN, + .enable_time = BD71837_BUCK8_STARTUP_TIME, .owner = THIS_MODULE, }, .init = { @@ -1157,6 +1207,7 @@ static struct bd718xx_regulator_data bd71837_regulators[] = { .linear_range_selectors = bd718xx_ldo1_volt_range_sel, .enable_reg = BD718XX_REG_LDO1_VOLT, .enable_mask = BD718XX_LDO_EN, + .enable_time = BD71837_LDO1_STARTUP_TIME, .owner = THIS_MODULE, }, .init = { @@ -1178,6 +1229,7 @@ static struct bd718xx_regulator_data bd71837_regulators[] = { .n_voltages = ARRAY_SIZE(ldo_2_volts), .enable_reg = BD718XX_REG_LDO2_VOLT, .enable_mask = BD718XX_LDO_EN, + .enable_time = BD71837_LDO2_STARTUP_TIME, .owner = THIS_MODULE, }, .init = { @@ -1200,6 +1252,7 @@ static struct bd718xx_regulator_data bd71837_regulators[] = { .vsel_mask = BD718XX_LDO3_MASK, .enable_reg = BD718XX_REG_LDO3_VOLT, .enable_mask = BD718XX_LDO_EN, + .enable_time = BD71837_LDO3_STARTUP_TIME, .owner = THIS_MODULE, }, .init = { @@ -1222,6 +1275,7 @@ static struct bd718xx_regulator_data bd71837_regulators[] = { .vsel_mask = BD718XX_LDO4_MASK, .enable_reg = BD718XX_REG_LDO4_VOLT, .enable_mask = BD718XX_LDO_EN, + .enable_time = BD71837_LDO4_STARTUP_TIME, .owner = THIS_MODULE, }, .init = { @@ -1246,6 +1300,7 @@ static struct bd718xx_regulator_data bd71837_regulators[] = { .vsel_mask = BD71837_LDO5_MASK, .enable_reg = BD718XX_REG_LDO5_VOLT, .enable_mask = BD718XX_LDO_EN, + .enable_time = BD71837_LDO5_STARTUP_TIME, .owner = THIS_MODULE, }, .init = { @@ -1272,6 +1327,7 @@ static struct bd718xx_regulator_data bd71837_regulators[] = { .vsel_mask = BD718XX_LDO6_MASK, .enable_reg = BD718XX_REG_LDO6_VOLT, .enable_mask = BD718XX_LDO_EN, + .enable_time = BD71837_LDO6_STARTUP_TIME, .owner = THIS_MODULE, }, .init = { @@ -1296,6 +1352,7 @@ static struct bd718xx_regulator_data bd71837_regulators[] = { .vsel_mask = BD71837_LDO7_MASK, .enable_reg = BD71837_REG_LDO7_VOLT, .enable_mask = BD718XX_LDO_EN, + .enable_time = BD71837_LDO7_STARTUP_TIME, .owner = THIS_MODULE, }, .init = { From e7e518053c267bb6be3799520d9f4a34c7264a2e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 21 Dec 2020 11:25:06 -0800 Subject: [PATCH 094/547] bpf: Add schedule point in htab_init_buckets() We noticed that with a LOCKDEP enabled kernel, allocating a hash table with 65536 buckets would use more than 60ms. htab_init_buckets() runs from process context, it is safe to schedule to avoid latency spikes. Fixes: c50eb518e262 ("bpf: Use separate lockdep class for each hashtab") Reported-by: John Sperbeck Signed-off-by: Eric Dumazet Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201221192506.707584-1-eric.dumazet@gmail.com --- kernel/bpf/hashtab.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 7e848200cd26..c1ac7f964bc9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -152,6 +152,7 @@ static void htab_init_buckets(struct bpf_htab *htab) lockdep_set_class(&htab->buckets[i].lock, &htab->lockdep_key); } + cond_resched(); } } From 54ddbdb024882e226055cc4c3c246592ddde2ee5 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 18 Dec 2020 09:38:43 -0800 Subject: [PATCH 095/547] net: systemport: set dev->max_mtu to UMAC_MAX_MTU_SIZE The driver is already allocating receive buffers of 2KiB and the Ethernet MAC is configured to accept frames up to UMAC_MAX_MTU_SIZE. Fixes: bfcb813203e6 ("net: dsa: configure the MTU for switch ports") Signed-off-by: Florian Fainelli Reviewed-by: Vladimir Oltean Link: https://lore.kernel.org/r/20201218173843.141046-1-f.fainelli@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bcmsysport.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index 0fdd19d99d99..b1ae9eb8f247 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -2577,6 +2577,7 @@ static int bcm_sysport_probe(struct platform_device *pdev) NETIF_F_HW_VLAN_CTAG_TX; dev->hw_features |= dev->features; dev->vlan_features |= dev->features; + dev->max_mtu = UMAC_MAX_MTU_SIZE; /* Request the WOL interrupt and advertise suspend if available */ priv->wol_irq_disabled = 1; From 1385ae5c30f238f81bc6528d897c6d7a0816783f Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 18 Dec 2020 11:55:36 +0100 Subject: [PATCH 096/547] ethernet: ucc_geth: set dev->max_mtu to 1518 All the buffers and registers are already set up appropriately for an MTU slightly above 1500, so we just need to expose this to the networking stack. AFAICT, there's no need to implement .ndo_change_mtu when the receive buffers are always set up to support the max_mtu. This fixes several warnings during boot on our mpc8309-board with an embedded mv88e6250 switch: mv88e6085 mdio@e0102120:10: nonfatal error -34 setting MTU 1500 on port 0 ... mv88e6085 mdio@e0102120:10: nonfatal error -34 setting MTU 1500 on port 4 ucc_geth e0102000.ethernet eth1: error -22 setting MTU to 1504 to include DSA overhead The last line explains what the DSA stack tries to do: achieving an MTU of 1500 on-the-wire requires that the master netdevice connected to the CPU port supports an MTU of 1500+the tagging overhead. Fixes: bfcb813203e6 ("net: dsa: configure the MTU for switch ports") Reviewed-by: Andrew Lunn Signed-off-by: Rasmus Villemoes Reviewed-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/ucc_geth.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c index ba8869c3d891..d6dbae480716 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.c +++ b/drivers/net/ethernet/freescale/ucc_geth.c @@ -3889,6 +3889,7 @@ static int ucc_geth_probe(struct platform_device* ofdev) INIT_WORK(&ugeth->timeout_work, ucc_geth_timeout_work); netif_napi_add(dev, &ugeth->napi, ucc_geth_poll, 64); dev->mtu = 1500; + dev->max_mtu = 1518; ugeth->msg_enable = netif_msg_init(debug.msg_enable, UGETH_MSG_DEFAULT); ugeth->phy_interface = phy_interface; From 887078de2a23689e29d6fa1b75d7cbc544c280be Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 18 Dec 2020 11:55:37 +0100 Subject: [PATCH 097/547] ethernet: ucc_geth: fix definition and size of ucc_geth_tx_global_pram Table 8-53 in the QUICC Engine Reference manual shows definitions of fields up to a size of 192 bytes, not just 128. But in table 8-111, one does find the text Base Address of the Global Transmitter Parameter RAM Page. [...] The user needs to allocate 128 bytes for this page. The address must be aligned to the page size. I've checked both rev. 7 (11/2015) and rev. 9 (05/2018) of the manual; they both have this inconsistency (and the table numbers are the same). Adding a bit of debug printing, on my board the struct ucc_geth_tx_global_pram is allocated at offset 0x880, while the (opaque) ucc_geth_thread_data_tx gets allocated immediately afterwards, at 0x900. So whatever the engine writes into the thread data overlaps with the tail of the global tx pram (and devmem says that something does get written during a simple ping). I haven't observed any failure that could be attributed to this, but it seems to be the kind of thing that would be extremely hard to debug. So extend the struct definition so that we do allocate 192 bytes. Signed-off-by: Rasmus Villemoes Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/ucc_geth.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/ucc_geth.h b/drivers/net/ethernet/freescale/ucc_geth.h index 1a9bdf66a7d8..11d4bf5dc21f 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.h +++ b/drivers/net/ethernet/freescale/ucc_geth.h @@ -575,7 +575,14 @@ struct ucc_geth_tx_global_pram { u32 vtagtable[0x8]; /* 8 4-byte VLAN tags */ u32 tqptr; /* a base pointer to the Tx Queues Memory Region */ - u8 res2[0x80 - 0x74]; + u8 res2[0x78 - 0x74]; + u64 snums_en; + u32 l2l3baseptr; /* top byte consists of a few other bit fields */ + + u16 mtu[8]; + u8 res3[0xa8 - 0x94]; + u32 wrrtablebase; /* top byte is reserved */ + u8 res4[0xc0 - 0xac]; } __packed; /* structure representing Extended Filtering Global Parameters in PRAM */ From e925e0cd2a705aaacb0b907bb3691fcac3a973a4 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 18 Dec 2020 11:55:38 +0100 Subject: [PATCH 098/547] ethernet: ucc_geth: fix use-after-free in ucc_geth_remove() ugeth is the netdiv_priv() part of the netdevice. Accessing the memory pointed to by ugeth (such as done by ucc_geth_memclean() and the two of_node_puts) after free_netdev() is thus use-after-free. Fixes: 80a9fad8e89a ("ucc_geth: fix module removal") Signed-off-by: Rasmus Villemoes Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/ucc_geth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c index d6dbae480716..6d853f018d53 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.c +++ b/drivers/net/ethernet/freescale/ucc_geth.c @@ -3935,12 +3935,12 @@ static int ucc_geth_remove(struct platform_device* ofdev) struct device_node *np = ofdev->dev.of_node; unregister_netdev(dev); - free_netdev(dev); ucc_geth_memclean(ugeth); if (of_phy_is_fixed_link(np)) of_phy_deregister_fixed_link(np); of_node_put(ugeth->ug_info->tbi_node); of_node_put(ugeth->ug_info->phy_node); + free_netdev(dev); return 0; } From 83469893204281ecf65d572bddf02de29a19787c Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Fri, 18 Dec 2020 13:50:01 -0800 Subject: [PATCH 099/547] ionic: account for vlan tag len in rx buffer len Let the FW know we have enough receive buffer space for the vlan tag if it isn't stripped. Fixes: 0f3154e6bcb3 ("ionic: Add Tx and Rx handling") Signed-off-by: Shannon Nelson Link: https://lore.kernel.org/r/20201218215001.64696-1-snelson@pensando.io Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/pensando/ionic/ionic_txrx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c index 9156c9825a16..ac4cd5d82e69 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c @@ -337,7 +337,7 @@ void ionic_rx_fill(struct ionic_queue *q) unsigned int i, j; unsigned int len; - len = netdev->mtu + ETH_HLEN; + len = netdev->mtu + ETH_HLEN + VLAN_HLEN; nfrags = round_up(len, PAGE_SIZE) / PAGE_SIZE; for (i = ionic_q_space_avail(q); i; i--) { From 8df66af5c1e5f80562fe728db5ec069b21810144 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 19 Dec 2020 14:01:44 +0300 Subject: [PATCH 100/547] atm: idt77252: call pci_disable_device() on error path This error path needs to disable the pci device before returning. Fixes: ede58ef28e10 ("atm: remove deprecated use of pci api") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/X93dmC4NX0vbTpGp@mwanda Signed-off-by: Jakub Kicinski --- drivers/atm/idt77252.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/atm/idt77252.c b/drivers/atm/idt77252.c index 65a3886f68c9..5f0472c18bcb 100644 --- a/drivers/atm/idt77252.c +++ b/drivers/atm/idt77252.c @@ -3607,7 +3607,7 @@ static int idt77252_init_one(struct pci_dev *pcidev, if ((err = dma_set_mask_and_coherent(&pcidev->dev, DMA_BIT_MASK(32)))) { printk("idt77252: can't enable DMA for PCI device at %s\n", pci_name(pcidev)); - return err; + goto err_out_disable_pdev; } card = kzalloc(sizeof(struct idt77252_dev), GFP_KERNEL); From bcce55f556e824d43f352d76b94509185585e38d Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Sat, 19 Dec 2020 13:19:24 +0100 Subject: [PATCH 101/547] ppp: Fix PPPIOCUNBRIDGECHAN request number PPPIOCGL2TPSTATS already uses 54. This shouldn't be a problem in practice, but let's keep the logical decreasing assignment scheme. Fixes: 4cf476ced45d ("ppp: add PPPIOCBRIDGECHAN and PPPIOCUNBRIDGECHAN ioctls") Signed-off-by: Guillaume Nault Link: https://lore.kernel.org/r/e3a4c355e3820331d8e1fffef8522739aae58b57.1608380117.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/ppp-ioctl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/ppp-ioctl.h b/include/uapi/linux/ppp-ioctl.h index 8dbecb3ad036..1cc5ce0ae062 100644 --- a/include/uapi/linux/ppp-ioctl.h +++ b/include/uapi/linux/ppp-ioctl.h @@ -116,7 +116,7 @@ struct pppol2tp_ioc_stats { #define PPPIOCGCHAN _IOR('t', 55, int) /* get ppp channel number */ #define PPPIOCGL2TPSTATS _IOR('t', 54, struct pppol2tp_ioc_stats) #define PPPIOCBRIDGECHAN _IOW('t', 53, int) /* bridge one channel to another */ -#define PPPIOCUNBRIDGECHAN _IO('t', 54) /* unbridge channel */ +#define PPPIOCUNBRIDGECHAN _IO('t', 52) /* unbridge channel */ #define SIOCGPPPSTATS (SIOCDEVPRIVATE + 0) #define SIOCGPPPVER (SIOCDEVPRIVATE + 1) /* NEVER change this!! */ From 2575bc1aa9d52a62342b57a0b7d0a12146cf6aed Mon Sep 17 00:00:00 2001 From: Stefan Chulski Date: Sun, 20 Dec 2020 13:02:29 +0200 Subject: [PATCH 102/547] net: mvpp2: Fix GoP port 3 Networking Complex Control configurations During GoP port 2 Networking Complex Control mode of operation configurations, also GoP port 3 mode of operation was wrongly set. Patch removes these configurations. Fixes: f84bf386f395 ("net: mvpp2: initialize the GoP") Acked-by: Marcin Wojtas Signed-off-by: Stefan Chulski Link: https://lore.kernel.org/r/1608462149-1702-1-git-send-email-stefanc@marvell.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 0f18d034ce3e..f20b31327027 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -1231,7 +1231,7 @@ static void mvpp22_gop_init_rgmii(struct mvpp2_port *port) regmap_read(priv->sysctrl_base, GENCONF_CTRL0, &val); if (port->gop_id == 2) - val |= GENCONF_CTRL0_PORT0_RGMII | GENCONF_CTRL0_PORT1_RGMII; + val |= GENCONF_CTRL0_PORT0_RGMII; else if (port->gop_id == 3) val |= GENCONF_CTRL0_PORT1_RGMII_MII; regmap_write(priv->sysctrl_base, GENCONF_CTRL0, val); From 74a2921948ed8c0e7f079a98442ec3493168cc85 Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 2 Dec 2020 18:36:57 +0800 Subject: [PATCH 103/547] scsi: hisi_sas: Expose HW queues for v2 hw As a performance enhancement, make the completion queue interrupts managed. In addition, in commit bf0beec0607d ("blk-mq: drain I/O when all CPUs in a hctx are offline"), CPU hotplug for MQ devices using managed interrupts is made safe. So expose HW queues to blk-mq to take advantage of this. Flag Scsi_host.host_tagset is also set to ensure that the HBA is not sent more commands than it can handle. However the driver still does not use request tag for IPTT as there are many HW bugs means that special rules apply for IPTT allocation. Link: https://lore.kernel.org/r/1606905417-183214-6-git-send-email-john.garry@huawei.com Signed-off-by: John Garry Signed-off-by: Martin K. Petersen --- drivers/scsi/hisi_sas/hisi_sas.h | 4 ++ drivers/scsi/hisi_sas/hisi_sas_main.c | 11 +++++ drivers/scsi/hisi_sas/hisi_sas_v2_hw.c | 66 +++++++++++++++++++++----- 3 files changed, 68 insertions(+), 13 deletions(-) diff --git a/drivers/scsi/hisi_sas/hisi_sas.h b/drivers/scsi/hisi_sas/hisi_sas.h index a25cfc11c96d..aa67807c5693 100644 --- a/drivers/scsi/hisi_sas/hisi_sas.h +++ b/drivers/scsi/hisi_sas/hisi_sas.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -312,6 +313,7 @@ enum { struct hisi_sas_hw { int (*hw_init)(struct hisi_hba *hisi_hba); + int (*interrupt_preinit)(struct hisi_hba *hisi_hba); void (*setup_itct)(struct hisi_hba *hisi_hba, struct hisi_sas_device *device); int (*slot_index_alloc)(struct hisi_hba *hisi_hba, @@ -418,6 +420,8 @@ struct hisi_hba { u32 refclk_frequency_mhz; u8 sas_addr[SAS_ADDR_SIZE]; + int *irq_map; /* v2 hw */ + int n_phy; spinlock_t lock; struct semaphore sem; diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c index 274ccf18ce2d..061d65b8a287 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_main.c +++ b/drivers/scsi/hisi_sas/hisi_sas_main.c @@ -2620,6 +2620,13 @@ err_out: return NULL; } +static int hisi_sas_interrupt_preinit(struct hisi_hba *hisi_hba) +{ + if (hisi_hba->hw->interrupt_preinit) + return hisi_hba->hw->interrupt_preinit(hisi_hba); + return 0; +} + int hisi_sas_probe(struct platform_device *pdev, const struct hisi_sas_hw *hw) { @@ -2677,6 +2684,10 @@ int hisi_sas_probe(struct platform_device *pdev, sha->sas_port[i] = &hisi_hba->port[i].sas_port; } + rc = hisi_sas_interrupt_preinit(hisi_hba); + if (rc) + goto err_out_ha; + rc = scsi_add_host(shost, &pdev->dev); if (rc) goto err_out_ha; diff --git a/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c index b57177b52fac..9adfdefef9ca 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c @@ -3302,6 +3302,28 @@ static irq_handler_t fatal_interrupts[HISI_SAS_FATAL_INT_NR] = { fatal_axi_int_v2_hw }; +#define CQ0_IRQ_INDEX (96) + +static int hisi_sas_v2_interrupt_preinit(struct hisi_hba *hisi_hba) +{ + struct platform_device *pdev = hisi_hba->platform_dev; + struct Scsi_Host *shost = hisi_hba->shost; + struct irq_affinity desc = { + .pre_vectors = CQ0_IRQ_INDEX, + .post_vectors = 16, + }; + int resv = desc.pre_vectors + desc.post_vectors, minvec = resv + 1, nvec; + + nvec = devm_platform_get_irqs_affinity(pdev, &desc, minvec, 128, + &hisi_hba->irq_map); + if (nvec < 0) + return nvec; + + shost->nr_hw_queues = hisi_hba->cq_nvecs = nvec - resv; + + return 0; +} + /* * There is a limitation in the hip06 chipset that we need * to map in all mbigen interrupts, even if they are not used. @@ -3310,14 +3332,11 @@ static int interrupt_init_v2_hw(struct hisi_hba *hisi_hba) { struct platform_device *pdev = hisi_hba->platform_dev; struct device *dev = &pdev->dev; - int irq, rc = 0, irq_map[128]; + int irq, rc = 0; int i, phy_no, fatal_no, queue_no; - for (i = 0; i < 128; i++) - irq_map[i] = platform_get_irq(pdev, i); - for (i = 0; i < HISI_SAS_PHY_INT_NR; i++) { - irq = irq_map[i + 1]; /* Phy up/down is irq1 */ + irq = hisi_hba->irq_map[i + 1]; /* Phy up/down is irq1 */ rc = devm_request_irq(dev, irq, phy_interrupts[i], 0, DRV_NAME " phy", hisi_hba); if (rc) { @@ -3331,7 +3350,7 @@ static int interrupt_init_v2_hw(struct hisi_hba *hisi_hba) for (phy_no = 0; phy_no < hisi_hba->n_phy; phy_no++) { struct hisi_sas_phy *phy = &hisi_hba->phy[phy_no]; - irq = irq_map[phy_no + 72]; + irq = hisi_hba->irq_map[phy_no + 72]; rc = devm_request_irq(dev, irq, sata_int_v2_hw, 0, DRV_NAME " sata", phy); if (rc) { @@ -3343,7 +3362,7 @@ static int interrupt_init_v2_hw(struct hisi_hba *hisi_hba) } for (fatal_no = 0; fatal_no < HISI_SAS_FATAL_INT_NR; fatal_no++) { - irq = irq_map[fatal_no + 81]; + irq = hisi_hba->irq_map[fatal_no + 81]; rc = devm_request_irq(dev, irq, fatal_interrupts[fatal_no], 0, DRV_NAME " fatal", hisi_hba); if (rc) { @@ -3354,24 +3373,22 @@ static int interrupt_init_v2_hw(struct hisi_hba *hisi_hba) } } - for (queue_no = 0; queue_no < hisi_hba->queue_count; queue_no++) { + for (queue_no = 0; queue_no < hisi_hba->cq_nvecs; queue_no++) { struct hisi_sas_cq *cq = &hisi_hba->cq[queue_no]; - cq->irq_no = irq_map[queue_no + 96]; + cq->irq_no = hisi_hba->irq_map[queue_no + 96]; rc = devm_request_threaded_irq(dev, cq->irq_no, cq_interrupt_v2_hw, cq_thread_v2_hw, IRQF_ONESHOT, DRV_NAME " cq", cq); if (rc) { dev_err(dev, "irq init: could not request cq interrupt %d, rc=%d\n", - irq, rc); + cq->irq_no, rc); rc = -ENOENT; goto err_out; } + cq->irq_mask = irq_get_affinity_mask(cq->irq_no); } - - hisi_hba->cq_nvecs = hisi_hba->queue_count; - err_out: return rc; } @@ -3529,6 +3546,26 @@ static struct device_attribute *host_attrs_v2_hw[] = { NULL }; +static int map_queues_v2_hw(struct Scsi_Host *shost) +{ + struct hisi_hba *hisi_hba = shost_priv(shost); + struct blk_mq_queue_map *qmap = &shost->tag_set.map[HCTX_TYPE_DEFAULT]; + const struct cpumask *mask; + unsigned int queue, cpu; + + for (queue = 0; queue < qmap->nr_queues; queue++) { + mask = irq_get_affinity_mask(hisi_hba->irq_map[96 + queue]); + if (!mask) + continue; + + for_each_cpu(cpu, mask) + qmap->mq_map[cpu] = qmap->queue_offset + queue; + } + + return 0; + +} + static struct scsi_host_template sht_v2_hw = { .name = DRV_NAME, .proc_name = DRV_NAME, @@ -3553,10 +3590,13 @@ static struct scsi_host_template sht_v2_hw = { #endif .shost_attrs = host_attrs_v2_hw, .host_reset = hisi_sas_host_reset, + .map_queues = map_queues_v2_hw, + .host_tagset = 1, }; static const struct hisi_sas_hw hisi_sas_v2_hw = { .hw_init = hisi_sas_v2_init, + .interrupt_preinit = hisi_sas_v2_interrupt_preinit, .setup_itct = setup_itct_v2_hw, .slot_index_alloc = slot_index_alloc_quirk_v2_hw, .alloc_dev = alloc_dev_quirk_v2_hw, From 2a5f1b67ec577fb1544b563086e0377f095f88e2 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 10 Dec 2020 08:30:59 +0000 Subject: [PATCH 104/547] KVM: arm64: Don't access PMCR_EL0 when no PMU is available We reset the guest's view of PMCR_EL0 unconditionally, based on the host's view of this register. It is however legal for an implementation not to provide any PMU, resulting in an UNDEF. The obvious fix is to skip the reset of this shadow register when no PMU is available, sidestepping the issue entirely. If no PMU is available, the guest is not able to request a virtual PMU anyway, so not doing nothing is the right thing to do! It is unlikely that this bug can hit any HW implementation though, as they all provide a PMU. It has been found using nested virt with the host KVM not implementing the PMU itself. Fixes: ab9468340d2bc ("arm64: KVM: Add access handler for PMCR register") Reviewed-by: Alexandru Elisei Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201210083059.1277162-1-maz@kernel.org --- arch/arm64/kvm/sys_regs.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 3313dedfa505..d46e7f706cb0 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -594,6 +594,10 @@ static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 pmcr, val; + /* No PMU available, PMCR_EL0 may UNDEF... */ + if (!kvm_arm_support_pmu_v3()) + return; + pmcr = read_sysreg(pmcr_el0); /* * Writable bits of PMCR_EL0 (ARMV8_PMU_PMCR_MASK) are reset to UNKNOWN From ff367fe473a9857160c17827931375a899076394 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Tue, 8 Dec 2020 14:24:47 +0000 Subject: [PATCH 105/547] KVM: arm64: Prevent use of invalid PSCI v0.1 function IDs PSCI driver exposes a struct containing the PSCI v0.1 function IDs configured in the DT. However, the struct does not convey the information whether these were set from DT or contain the default value zero. This could be a problem for PSCI proxy in KVM protected mode. Extend config passed to KVM with a bit mask with individual bits set depending on whether the corresponding function pointer in psci_ops is set, eg. set bit for PSCI_CPU_SUSPEND if psci_ops.cpu_suspend != NULL. Previously config was split into multiple global variables. Put everything into a single struct for convenience. Reported-by: Mark Rutland Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201208142452.87237-2-dbrazdil@google.com --- arch/arm64/include/asm/kvm_host.h | 20 +++++++++++ arch/arm64/kvm/arm.c | 14 +++++--- arch/arm64/kvm/hyp/nvhe/psci-relay.c | 53 +++++++++++++++++++++------- 3 files changed, 70 insertions(+), 17 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 11beda85ee7e..828d50d40dc2 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -240,6 +241,25 @@ struct kvm_host_data { struct kvm_pmu_events pmu_events; }; +#define KVM_HOST_PSCI_0_1_CPU_SUSPEND BIT(0) +#define KVM_HOST_PSCI_0_1_CPU_ON BIT(1) +#define KVM_HOST_PSCI_0_1_CPU_OFF BIT(2) +#define KVM_HOST_PSCI_0_1_MIGRATE BIT(3) + +struct kvm_host_psci_config { + /* PSCI version used by host. */ + u32 version; + + /* Function IDs used by host if version is v0.1. */ + struct psci_0_1_function_ids function_ids_0_1; + + /* Bitmask of functions enabled for v0.1, bits KVM_HOST_PSCI_0_1_*. */ + unsigned int enabled_functions_0_1; +}; + +extern struct kvm_host_psci_config kvm_nvhe_sym(kvm_host_psci_config); +#define kvm_host_psci_config CHOOSE_NVHE_SYM(kvm_host_psci_config) + struct vcpu_reset_state { unsigned long pc; unsigned long r0; diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 6e637d2b4cfb..6a2f4e01b04f 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -66,8 +66,6 @@ static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled); DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use); extern u64 kvm_nvhe_sym(__cpu_logical_map)[NR_CPUS]; -extern u32 kvm_nvhe_sym(kvm_host_psci_version); -extern struct psci_0_1_function_ids kvm_nvhe_sym(kvm_host_psci_0_1_function_ids); int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) { @@ -1618,8 +1616,16 @@ static bool init_psci_relay(void) return false; } - kvm_nvhe_sym(kvm_host_psci_version) = psci_ops.get_version(); - kvm_nvhe_sym(kvm_host_psci_0_1_function_ids) = get_psci_0_1_function_ids(); + kvm_host_psci_config.version = psci_ops.get_version(); + + if (kvm_host_psci_config.version == PSCI_VERSION(0, 1)) { + kvm_host_psci_config.function_ids_0_1 = get_psci_0_1_function_ids(); + kvm_host_psci_config.enabled_functions_0_1 = + (psci_ops.cpu_suspend ? KVM_HOST_PSCI_0_1_CPU_SUSPEND : 0) | + (psci_ops.cpu_off ? KVM_HOST_PSCI_0_1_CPU_OFF : 0) | + (psci_ops.cpu_on ? KVM_HOST_PSCI_0_1_CPU_ON : 0) | + (psci_ops.migrate ? KVM_HOST_PSCI_0_1_MIGRATE : 0); + } return true; } diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c index 08dc9de69314..0d6f4aa39621 100644 --- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c +++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c @@ -22,9 +22,8 @@ void kvm_hyp_cpu_resume(unsigned long r0); void __noreturn __host_enter(struct kvm_cpu_context *host_ctxt); /* Config options set by the host. */ -__ro_after_init u32 kvm_host_psci_version; -__ro_after_init struct psci_0_1_function_ids kvm_host_psci_0_1_function_ids; -__ro_after_init s64 hyp_physvirt_offset; +struct kvm_host_psci_config __ro_after_init kvm_host_psci_config; +s64 __ro_after_init hyp_physvirt_offset; #define __hyp_pa(x) ((phys_addr_t)((x)) + hyp_physvirt_offset) @@ -54,12 +53,41 @@ static u64 get_psci_func_id(struct kvm_cpu_context *host_ctxt) return func_id; } +static inline bool is_psci_0_1_function_enabled(unsigned int fn_bit) +{ + return kvm_host_psci_config.enabled_functions_0_1 & fn_bit; +} + +static inline bool is_psci_0_1_cpu_suspend(u64 func_id) +{ + return is_psci_0_1_function_enabled(KVM_HOST_PSCI_0_1_CPU_SUSPEND) && + (func_id == kvm_host_psci_config.function_ids_0_1.cpu_suspend); +} + +static inline bool is_psci_0_1_cpu_on(u64 func_id) +{ + return is_psci_0_1_function_enabled(KVM_HOST_PSCI_0_1_CPU_ON) && + (func_id == kvm_host_psci_config.function_ids_0_1.cpu_on); +} + +static inline bool is_psci_0_1_cpu_off(u64 func_id) +{ + return is_psci_0_1_function_enabled(KVM_HOST_PSCI_0_1_CPU_OFF) && + (func_id == kvm_host_psci_config.function_ids_0_1.cpu_off); +} + +static inline bool is_psci_0_1_migrate(u64 func_id) +{ + return is_psci_0_1_function_enabled(KVM_HOST_PSCI_0_1_MIGRATE) && + (func_id == kvm_host_psci_config.function_ids_0_1.migrate); +} + static bool is_psci_0_1_call(u64 func_id) { - return (func_id == kvm_host_psci_0_1_function_ids.cpu_suspend) || - (func_id == kvm_host_psci_0_1_function_ids.cpu_on) || - (func_id == kvm_host_psci_0_1_function_ids.cpu_off) || - (func_id == kvm_host_psci_0_1_function_ids.migrate); + return is_psci_0_1_cpu_suspend(func_id) || + is_psci_0_1_cpu_on(func_id) || + is_psci_0_1_cpu_off(func_id) || + is_psci_0_1_migrate(func_id); } static bool is_psci_0_2_call(u64 func_id) @@ -71,7 +99,7 @@ static bool is_psci_0_2_call(u64 func_id) static bool is_psci_call(u64 func_id) { - switch (kvm_host_psci_version) { + switch (kvm_host_psci_config.version) { case PSCI_VERSION(0, 1): return is_psci_0_1_call(func_id); default: @@ -248,12 +276,11 @@ asmlinkage void __noreturn kvm_host_psci_cpu_entry(bool is_cpu_on) static unsigned long psci_0_1_handler(u64 func_id, struct kvm_cpu_context *host_ctxt) { - if ((func_id == kvm_host_psci_0_1_function_ids.cpu_off) || - (func_id == kvm_host_psci_0_1_function_ids.migrate)) + if (is_psci_0_1_cpu_off(func_id) || is_psci_0_1_migrate(func_id)) return psci_forward(host_ctxt); - else if (func_id == kvm_host_psci_0_1_function_ids.cpu_on) + else if (is_psci_0_1_cpu_on(func_id)) return psci_cpu_on(func_id, host_ctxt); - else if (func_id == kvm_host_psci_0_1_function_ids.cpu_suspend) + else if (is_psci_0_1_cpu_suspend(func_id)) return psci_cpu_suspend(func_id, host_ctxt); else return PSCI_RET_NOT_SUPPORTED; @@ -304,7 +331,7 @@ bool kvm_host_psci_handler(struct kvm_cpu_context *host_ctxt) if (!is_psci_call(func_id)) return false; - switch (kvm_host_psci_version) { + switch (kvm_host_psci_config.version) { case PSCI_VERSION(0, 1): ret = psci_0_1_handler(func_id, host_ctxt); break; From 7a96a0687b80a1870c689418d7b72012c8bdd53d Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Tue, 8 Dec 2020 14:24:48 +0000 Subject: [PATCH 106/547] KVM: arm64: Use lm_alias in nVHE-only VA conversion init_hyp_physvirt_offset() computes PA from a kernel VA. Conversion to kernel linear-map is required first but the code used kvm_ksym_ref() for this purpose. Under VHE that is a NOP and resulted in a runtime warning. Replace kvm_ksym_ref with lm_alias. Reported-by: Qian Cai Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201208142452.87237-3-dbrazdil@google.com --- arch/arm64/kvm/va_layout.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/va_layout.c b/arch/arm64/kvm/va_layout.c index d8cc51bd60bf..914732b88c69 100644 --- a/arch/arm64/kvm/va_layout.c +++ b/arch/arm64/kvm/va_layout.c @@ -42,7 +42,7 @@ static void init_hyp_physvirt_offset(void) u64 kern_va, hyp_va; /* Compute the offset from the hyp VA and PA of a random symbol. */ - kern_va = (u64)kvm_ksym_ref(__hyp_text_start); + kern_va = (u64)lm_alias(__hyp_text_start); hyp_va = __early_kern_hyp_va(kern_va); CHOOSE_NVHE_SYM(hyp_physvirt_offset) = (s64)__pa(kern_va) - (s64)hyp_va; } From c3e181aec96f6ada84df1cb72a72be8970f8b284 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Tue, 8 Dec 2020 14:24:49 +0000 Subject: [PATCH 107/547] KVM: arm64: Skip computing hyp VA layout for VHE Computing the hyp VA layout is redundant when the kernel runs in EL2 and hyp shares its VA mappings. Make calling kvm_compute_layout() conditional on not just CONFIG_KVM but also !is_kernel_in_hyp_mode(). Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201208142452.87237-4-dbrazdil@google.com --- arch/arm64/kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 18e9727d3f64..4e585cc892e8 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -434,7 +434,7 @@ static void __init hyp_mode_check(void) "CPU: CPUs started in inconsistent modes"); else pr_info("CPU: All CPU(s) started at EL1\n"); - if (IS_ENABLED(CONFIG_KVM)) + if (IS_ENABLED(CONFIG_KVM) && !is_kernel_in_hyp_mode()) kvm_compute_layout(); } From 61fe0c37af57ac35472a870581a7d0bb5ac2f63a Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Tue, 8 Dec 2020 14:24:50 +0000 Subject: [PATCH 108/547] KVM: arm64: Minor cleanup of hyp variables used in host Small cleanup moving declarations of hyp-exported variables to kvm_host.h and using macros to avoid having to refer to them with kvm_nvhe_sym() in host. No functional change intended. Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201208142452.87237-5-dbrazdil@google.com --- arch/arm64/include/asm/kvm_host.h | 6 ++++++ arch/arm64/kvm/arm.c | 4 +--- arch/arm64/kvm/hyp/nvhe/hyp-smp.c | 6 +++--- arch/arm64/kvm/va_layout.c | 5 ++--- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 828d50d40dc2..bce2452b305c 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -260,6 +260,12 @@ struct kvm_host_psci_config { extern struct kvm_host_psci_config kvm_nvhe_sym(kvm_host_psci_config); #define kvm_host_psci_config CHOOSE_NVHE_SYM(kvm_host_psci_config) +extern s64 kvm_nvhe_sym(hyp_physvirt_offset); +#define hyp_physvirt_offset CHOOSE_NVHE_SYM(hyp_physvirt_offset) + +extern u64 kvm_nvhe_sym(hyp_cpu_logical_map)[NR_CPUS]; +#define hyp_cpu_logical_map CHOOSE_NVHE_SYM(hyp_cpu_logical_map) + struct vcpu_reset_state { unsigned long pc; unsigned long r0; diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 6a2f4e01b04f..836ca763b91d 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -65,8 +65,6 @@ static bool vgic_present; static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled); DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use); -extern u64 kvm_nvhe_sym(__cpu_logical_map)[NR_CPUS]; - int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) { return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; @@ -1602,7 +1600,7 @@ static void init_cpu_logical_map(void) * allow any other CPUs from the `possible` set to boot. */ for_each_online_cpu(cpu) - kvm_nvhe_sym(__cpu_logical_map)[cpu] = cpu_logical_map(cpu); + hyp_cpu_logical_map[cpu] = cpu_logical_map(cpu); } static bool init_psci_relay(void) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-smp.c b/arch/arm64/kvm/hyp/nvhe/hyp-smp.c index cbab0c6246e2..2997aa156d8e 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-smp.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-smp.c @@ -14,14 +14,14 @@ * Other CPUs should not be allowed to boot because their features were * not checked against the finalized system capabilities. */ -u64 __ro_after_init __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID }; +u64 __ro_after_init hyp_cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID }; u64 cpu_logical_map(unsigned int cpu) { - if (cpu >= ARRAY_SIZE(__cpu_logical_map)) + if (cpu >= ARRAY_SIZE(hyp_cpu_logical_map)) hyp_panic(); - return __cpu_logical_map[cpu]; + return hyp_cpu_logical_map[cpu]; } unsigned long __hyp_per_cpu_offset(unsigned int cpu) diff --git a/arch/arm64/kvm/va_layout.c b/arch/arm64/kvm/va_layout.c index 914732b88c69..70fcd6a12fe1 100644 --- a/arch/arm64/kvm/va_layout.c +++ b/arch/arm64/kvm/va_layout.c @@ -34,17 +34,16 @@ static u64 __early_kern_hyp_va(u64 addr) } /* - * Store a hyp VA <-> PA offset into a hyp-owned variable. + * Store a hyp VA <-> PA offset into a EL2-owned variable. */ static void init_hyp_physvirt_offset(void) { - extern s64 kvm_nvhe_sym(hyp_physvirt_offset); u64 kern_va, hyp_va; /* Compute the offset from the hyp VA and PA of a random symbol. */ kern_va = (u64)lm_alias(__hyp_text_start); hyp_va = __early_kern_hyp_va(kern_va); - CHOOSE_NVHE_SYM(hyp_physvirt_offset) = (s64)__pa(kern_va) - (s64)hyp_va; + hyp_physvirt_offset = (s64)__pa(kern_va) - (s64)hyp_va; } /* From e6829e0384a49efe68537298132230bebd8bd1b3 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Tue, 8 Dec 2020 14:24:51 +0000 Subject: [PATCH 109/547] KVM: arm64: Remove unused includes in psci-relay.c Minor cleanup removing unused includes. Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201208142452.87237-6-dbrazdil@google.com --- arch/arm64/kvm/hyp/nvhe/psci-relay.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c index 0d6f4aa39621..1f7237e45148 100644 --- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c +++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c @@ -7,11 +7,8 @@ #include #include #include -#include #include #include -#include -#include #include #include From 860a4c3d1e04a3c3e62bacbbba64417bf49768e2 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Tue, 8 Dec 2020 14:24:52 +0000 Subject: [PATCH 110/547] KVM: arm64: Move skip_host_instruction to adjust_pc.h Move function for skipping host instruction in the host trap handler to a header file containing analogical helpers for guests. Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201208142452.87237-7-dbrazdil@google.com --- arch/arm64/kvm/hyp/include/hyp/adjust_pc.h | 9 +++++++++ arch/arm64/kvm/hyp/nvhe/hyp-main.c | 12 ++---------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/hyp/adjust_pc.h b/arch/arm64/kvm/hyp/include/hyp/adjust_pc.h index b1f60923a8fe..61716359035d 100644 --- a/arch/arm64/kvm/hyp/include/hyp/adjust_pc.h +++ b/arch/arm64/kvm/hyp/include/hyp/adjust_pc.h @@ -59,4 +59,13 @@ static inline void __adjust_pc(struct kvm_vcpu *vcpu) } } +/* + * Skip an instruction while host sysregs are live. + * Assumes host is always 64-bit. + */ +static inline void kvm_skip_host_instr(void) +{ + write_sysreg_el2(read_sysreg_el2(SYS_ELR) + 4, SYS_ELR); +} + #endif diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index bde658d51404..a906f9e2ff34 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -157,11 +157,6 @@ static void default_host_smc_handler(struct kvm_cpu_context *host_ctxt) __kvm_hyp_host_forward_smc(host_ctxt); } -static void skip_host_instruction(void) -{ - write_sysreg_el2(read_sysreg_el2(SYS_ELR) + 4, SYS_ELR); -} - static void handle_host_smc(struct kvm_cpu_context *host_ctxt) { bool handled; @@ -170,11 +165,8 @@ static void handle_host_smc(struct kvm_cpu_context *host_ctxt) if (!handled) default_host_smc_handler(host_ctxt); - /* - * Unlike HVC, the return address of an SMC is the instruction's PC. - * Move the return address past the instruction. - */ - skip_host_instruction(); + /* SMC was trapped, move ELR past the current PC. */ + kvm_skip_host_instr(); } void handle_trap(struct kvm_cpu_context *host_ctxt) From 767c973f2e4a9264a4f159c9fad5ca8acdb9915e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 22 Dec 2020 12:46:41 +0000 Subject: [PATCH 111/547] KVM: arm64: Declutter host PSCI 0.1 handling Although there is nothing wrong with the current host PSCI relay implementation, we can clean it up and remove some of the helpers that do not improve the overall readability of the legacy PSCI 0.1 handling. Opportunity is taken to turn the bitmap into a set of booleans, and creative use of preprocessor macros make init and check more concise/readable. Suggested-by: Mark Rutland Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 11 ++-- arch/arm64/kvm/arm.c | 12 +++-- arch/arm64/kvm/hyp/nvhe/psci-relay.c | 77 +++++++--------------------- 3 files changed, 30 insertions(+), 70 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index bce2452b305c..8fcfab0c2567 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -241,11 +241,6 @@ struct kvm_host_data { struct kvm_pmu_events pmu_events; }; -#define KVM_HOST_PSCI_0_1_CPU_SUSPEND BIT(0) -#define KVM_HOST_PSCI_0_1_CPU_ON BIT(1) -#define KVM_HOST_PSCI_0_1_CPU_OFF BIT(2) -#define KVM_HOST_PSCI_0_1_MIGRATE BIT(3) - struct kvm_host_psci_config { /* PSCI version used by host. */ u32 version; @@ -253,8 +248,10 @@ struct kvm_host_psci_config { /* Function IDs used by host if version is v0.1. */ struct psci_0_1_function_ids function_ids_0_1; - /* Bitmask of functions enabled for v0.1, bits KVM_HOST_PSCI_0_1_*. */ - unsigned int enabled_functions_0_1; + bool psci_0_1_cpu_suspend_implemented; + bool psci_0_1_cpu_on_implemented; + bool psci_0_1_cpu_off_implemented; + bool psci_0_1_migrate_implemented; }; extern struct kvm_host_psci_config kvm_nvhe_sym(kvm_host_psci_config); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 836ca763b91d..e207e4541f55 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1603,6 +1603,9 @@ static void init_cpu_logical_map(void) hyp_cpu_logical_map[cpu] = cpu_logical_map(cpu); } +#define init_psci_0_1_impl_state(config, what) \ + config.psci_0_1_ ## what ## _implemented = psci_ops.what + static bool init_psci_relay(void) { /* @@ -1618,11 +1621,10 @@ static bool init_psci_relay(void) if (kvm_host_psci_config.version == PSCI_VERSION(0, 1)) { kvm_host_psci_config.function_ids_0_1 = get_psci_0_1_function_ids(); - kvm_host_psci_config.enabled_functions_0_1 = - (psci_ops.cpu_suspend ? KVM_HOST_PSCI_0_1_CPU_SUSPEND : 0) | - (psci_ops.cpu_off ? KVM_HOST_PSCI_0_1_CPU_OFF : 0) | - (psci_ops.cpu_on ? KVM_HOST_PSCI_0_1_CPU_ON : 0) | - (psci_ops.migrate ? KVM_HOST_PSCI_0_1_MIGRATE : 0); + init_psci_0_1_impl_state(kvm_host_psci_config, cpu_suspend); + init_psci_0_1_impl_state(kvm_host_psci_config, cpu_on); + init_psci_0_1_impl_state(kvm_host_psci_config, cpu_off); + init_psci_0_1_impl_state(kvm_host_psci_config, migrate); } return true; } diff --git a/arch/arm64/kvm/hyp/nvhe/psci-relay.c b/arch/arm64/kvm/hyp/nvhe/psci-relay.c index 1f7237e45148..e3947846ffcb 100644 --- a/arch/arm64/kvm/hyp/nvhe/psci-relay.c +++ b/arch/arm64/kvm/hyp/nvhe/psci-relay.c @@ -43,48 +43,16 @@ struct psci_boot_args { static DEFINE_PER_CPU(struct psci_boot_args, cpu_on_args) = PSCI_BOOT_ARGS_INIT; static DEFINE_PER_CPU(struct psci_boot_args, suspend_args) = PSCI_BOOT_ARGS_INIT; -static u64 get_psci_func_id(struct kvm_cpu_context *host_ctxt) -{ - DECLARE_REG(u64, func_id, host_ctxt, 0); - - return func_id; -} - -static inline bool is_psci_0_1_function_enabled(unsigned int fn_bit) -{ - return kvm_host_psci_config.enabled_functions_0_1 & fn_bit; -} - -static inline bool is_psci_0_1_cpu_suspend(u64 func_id) -{ - return is_psci_0_1_function_enabled(KVM_HOST_PSCI_0_1_CPU_SUSPEND) && - (func_id == kvm_host_psci_config.function_ids_0_1.cpu_suspend); -} - -static inline bool is_psci_0_1_cpu_on(u64 func_id) -{ - return is_psci_0_1_function_enabled(KVM_HOST_PSCI_0_1_CPU_ON) && - (func_id == kvm_host_psci_config.function_ids_0_1.cpu_on); -} - -static inline bool is_psci_0_1_cpu_off(u64 func_id) -{ - return is_psci_0_1_function_enabled(KVM_HOST_PSCI_0_1_CPU_OFF) && - (func_id == kvm_host_psci_config.function_ids_0_1.cpu_off); -} - -static inline bool is_psci_0_1_migrate(u64 func_id) -{ - return is_psci_0_1_function_enabled(KVM_HOST_PSCI_0_1_MIGRATE) && - (func_id == kvm_host_psci_config.function_ids_0_1.migrate); -} +#define is_psci_0_1(what, func_id) \ + (kvm_host_psci_config.psci_0_1_ ## what ## _implemented && \ + (func_id) == kvm_host_psci_config.function_ids_0_1.what) static bool is_psci_0_1_call(u64 func_id) { - return is_psci_0_1_cpu_suspend(func_id) || - is_psci_0_1_cpu_on(func_id) || - is_psci_0_1_cpu_off(func_id) || - is_psci_0_1_migrate(func_id); + return (is_psci_0_1(cpu_suspend, func_id) || + is_psci_0_1(cpu_on, func_id) || + is_psci_0_1(cpu_off, func_id) || + is_psci_0_1(migrate, func_id)); } static bool is_psci_0_2_call(u64 func_id) @@ -94,16 +62,6 @@ static bool is_psci_0_2_call(u64 func_id) (PSCI_0_2_FN64(0) <= func_id && func_id <= PSCI_0_2_FN64(31)); } -static bool is_psci_call(u64 func_id) -{ - switch (kvm_host_psci_config.version) { - case PSCI_VERSION(0, 1): - return is_psci_0_1_call(func_id); - default: - return is_psci_0_2_call(func_id); - } -} - static unsigned long psci_call(unsigned long fn, unsigned long arg0, unsigned long arg1, unsigned long arg2) { @@ -273,14 +231,14 @@ asmlinkage void __noreturn kvm_host_psci_cpu_entry(bool is_cpu_on) static unsigned long psci_0_1_handler(u64 func_id, struct kvm_cpu_context *host_ctxt) { - if (is_psci_0_1_cpu_off(func_id) || is_psci_0_1_migrate(func_id)) + if (is_psci_0_1(cpu_off, func_id) || is_psci_0_1(migrate, func_id)) return psci_forward(host_ctxt); - else if (is_psci_0_1_cpu_on(func_id)) + if (is_psci_0_1(cpu_on, func_id)) return psci_cpu_on(func_id, host_ctxt); - else if (is_psci_0_1_cpu_suspend(func_id)) + if (is_psci_0_1(cpu_suspend, func_id)) return psci_cpu_suspend(func_id, host_ctxt); - else - return PSCI_RET_NOT_SUPPORTED; + + return PSCI_RET_NOT_SUPPORTED; } static unsigned long psci_0_2_handler(u64 func_id, struct kvm_cpu_context *host_ctxt) @@ -322,20 +280,23 @@ static unsigned long psci_1_0_handler(u64 func_id, struct kvm_cpu_context *host_ bool kvm_host_psci_handler(struct kvm_cpu_context *host_ctxt) { - u64 func_id = get_psci_func_id(host_ctxt); + DECLARE_REG(u64, func_id, host_ctxt, 0); unsigned long ret; - if (!is_psci_call(func_id)) - return false; - switch (kvm_host_psci_config.version) { case PSCI_VERSION(0, 1): + if (!is_psci_0_1_call(func_id)) + return false; ret = psci_0_1_handler(func_id, host_ctxt); break; case PSCI_VERSION(0, 2): + if (!is_psci_0_2_call(func_id)) + return false; ret = psci_0_2_handler(func_id, host_ctxt); break; default: + if (!is_psci_0_2_call(func_id)) + return false; ret = psci_1_0_handler(func_id, host_ctxt); break; } From 8b3fd902391fdee526f6ba46899a3f8005983ae1 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Thu, 17 Dec 2020 08:15:01 +0100 Subject: [PATCH 112/547] MAINTAINERS: include governors into CPU IDLE TIME MANAGEMENT FRAMEWORK The current pattern in the file entry does not make the files in the governors subdirectory to be a part of the CPU IDLE TIME MANAGEMENT FRAMEWORK. Adjust the file pattern to include files in governors. Signed-off-by: Lukas Bulwahn Signed-off-by: Rafael J. Wysocki --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 7e76ae7ee8a2..59be280069d1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4583,7 +4583,7 @@ B: https://bugzilla.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git F: Documentation/admin-guide/pm/cpuidle.rst F: Documentation/driver-api/pm/cpuidle.rst -F: drivers/cpuidle/* +F: drivers/cpuidle/ F: include/linux/cpuidle.h CPU POWER MONITORING SUBSYSTEM From 7887cc89d5851cbdec49219e9614beec776af150 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 14 Dec 2020 23:34:13 +0100 Subject: [PATCH 113/547] ARM: dts: ux500/golden: Set display max brightness A too high brightness by default (default is max) makes the screen go blank. Set this to 15 as in the Vendor tree. Signed-off-by: Linus Walleij Cc: Stephan Gerhold Link: https://lore.kernel.org/r/20201214223413.253893-1-linus.walleij@linaro.org' Signed-off-by: Arnd Bergmann --- arch/arm/boot/dts/ste-ux500-samsung-golden.dts | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/boot/dts/ste-ux500-samsung-golden.dts b/arch/arm/boot/dts/ste-ux500-samsung-golden.dts index a1093cb37dc7..aed1f2d5f246 100644 --- a/arch/arm/boot/dts/ste-ux500-samsung-golden.dts +++ b/arch/arm/boot/dts/ste-ux500-samsung-golden.dts @@ -326,6 +326,7 @@ panel@0 { compatible = "samsung,s6e63m0"; reg = <0>; + max-brightness = <15>; vdd3-supply = <&panel_reg_3v0>; vci-supply = <&panel_reg_1v8>; reset-gpios = <&gpio4 11 GPIO_ACTIVE_LOW>; From f87777a3c30cf50c66a20e1d153f0e003bb30774 Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Sat, 19 Dec 2020 14:50:36 +0100 Subject: [PATCH 114/547] net: stmmac: dwmac-meson8b: ignore the second clock input The dwmac glue registers on Amlogic Meson8b and newer SoCs has two clock inputs: - Meson8b and Meson8m2: MPLL2 and MPLL2 (the same parent is wired to both inputs) - GXBB, GXL, GXM, AXG, G12A, G12B, SM1: FCLK_DIV2 and MPLL2 All known vendor kernels and u-boots are using the first input only. We let the common clock framework automatically choose the "right" parent. For some boards this causes a problem though, specificially with G12A and newer SoCs. The clock input is used for generating the 125MHz RGMII TX clock. For the two input clocks this means on G12A: - FCLK_DIV2: 999999985Hz / 8 = 124999998.125Hz - MPLL2: 499999993Hz / 4 = 124999998.25Hz In theory MPLL2 is the "better" clock input because it's gets us 0.125Hz closer to the requested frequency than FCLK_DIV2. In reality however there is a resource conflict because MPLL2 is needed to generate some of the audio clocks. dwmac-meson8b probes first and sets up the clock tree with MPLL2. This works fine until the audio driver comes and "steals" the MPLL2 clocks and configures it with it's own rate (294909637Hz). The common clock framework happily changes the MPLL2 rate but does not reconfigure our RGMII TX clock tree, which then ends up at 73727409Hz, which is more than 40% off the requested 125MHz. Don't use the second clock input for now to force the common clock framework to always select the first parent. This mimics the behavior from the vendor driver and fixes the clock resource conflict with the audio driver on G12A boards. Once the common clock framework can handle this situation this change can be reverted again. Fixes: 566e8251625304 ("net: stmmac: add a glue driver for the Amlogic Meson 8b / GXBB DWMAC") Reported-by: Thomas Graichen Signed-off-by: Martin Blumenstingl Tested-by: thomas graichen Link: https://lore.kernel.org/r/20201219135036.3216017-1-martin.blumenstingl@googlemail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c index 459ae715b33d..f184b00f5116 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c @@ -135,7 +135,7 @@ static int meson8b_init_rgmii_tx_clk(struct meson8b_dwmac *dwmac) struct device *dev = dwmac->dev; static const struct clk_parent_data mux_parents[] = { { .fw_name = "clkin0", }, - { .fw_name = "clkin1", }, + { .index = -1, }, }; static const struct clk_div_table div_table[] = { { .div = 2, .val = 2, }, From 8b0f64b113d617c995ffdf50196948c3e99c6e49 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 19 Dec 2020 10:55:38 -0800 Subject: [PATCH 115/547] MAINTAINERS: remove names from mailing list maintainers When searching for inactive maintainers it's useful to filter out mailing list addresses. Such "maintainers" will obviously never feature in a "From:" line of an email or a review tag. Since "L:" entries only provide the address of a mailing list without a fancy name extend this pattern to "M:" entries. Alternatively we could reserve M: entries for humans only and move the fake "maintainers" to L:. While I'd personally prefer to reserve M: for humans only, I'm not 100% that's a great choice either, given most L: entries are in fact open mailing lists with public archives. Link: https://lore.kernel.org/r/20201219185538.750076-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 28d7acdb0591..20cd4cc08dd1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -203,8 +203,8 @@ F: include/uapi/linux/nl80211.h F: net/wireless/ 8169 10/100/1000 GIGABIT ETHERNET DRIVER -M: Realtek linux nic maintainers M: Heiner Kallweit +M: nic_swsd@realtek.com L: netdev@vger.kernel.org S: Maintained F: drivers/net/ethernet/realtek/r8169* @@ -2119,7 +2119,7 @@ N: atmel ARM/Microchip Sparx5 SoC support M: Lars Povlsen M: Steen Hegelund -M: Microchip Linux Driver Support +M: UNGLinuxDriver@microchip.com L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Supported T: git git://github.com/microchip-ung/linux-upstream.git @@ -3958,7 +3958,7 @@ F: net/can/ CAN-J1939 NETWORK LAYER M: Robin van der Gracht M: Oleksij Rempel -R: Pengutronix Kernel Team +R: kernel@pengutronix.de L: linux-can@vger.kernel.org S: Maintained F: Documentation/networking/j1939.rst @@ -11651,7 +11651,7 @@ F: drivers/media/platform/atmel/atmel-isi.h MICROCHIP KSZ SERIES ETHERNET SWITCH DRIVER M: Woojung Huh -M: Microchip Linux Driver Support +M: UNGLinuxDriver@microchip.com L: netdev@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/net/dsa/microchip,ksz.yaml @@ -11661,7 +11661,7 @@ F: net/dsa/tag_ksz.c MICROCHIP LAN743X ETHERNET DRIVER M: Bryan Whitehead -M: Microchip Linux Driver Support +M: UNGLinuxDriver@microchip.com L: netdev@vger.kernel.org S: Maintained F: drivers/net/ethernet/microchip/lan743x_* @@ -11755,7 +11755,7 @@ F: drivers/net/wireless/microchip/wilc1000/ MICROSEMI MIPS SOCS M: Alexandre Belloni -M: Microchip Linux Driver Support +M: UNGLinuxDriver@microchip.com L: linux-mips@vger.kernel.org S: Supported F: Documentation/devicetree/bindings/mips/mscc.txt @@ -12809,10 +12809,10 @@ F: tools/objtool/ F: include/linux/objtool.h OCELOT ETHERNET SWITCH DRIVER -M: Microchip Linux Driver Support M: Vladimir Oltean M: Claudiu Manoil M: Alexandre Belloni +M: UNGLinuxDriver@microchip.com L: netdev@vger.kernel.org S: Supported F: drivers/net/dsa/ocelot/* @@ -13874,7 +13874,7 @@ F: drivers/platform/x86/peaq-wmi.c PENSANDO ETHERNET DRIVERS M: Shannon Nelson -M: Pensando Drivers +M: drivers@pensando.io L: netdev@vger.kernel.org S: Supported F: Documentation/networking/device_drivers/ethernet/pensando/ionic.rst @@ -14653,7 +14653,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvalo/ath.git F: drivers/net/wireless/ath/ath11k/ QUALCOMM ATHEROS ATH9K WIRELESS DRIVER -M: QCA ath9k Development +M: ath9k-devel@qca.qualcomm.com L: linux-wireless@vger.kernel.org S: Supported W: https://wireless.wiki.kernel.org/en/users/Drivers/ath9k @@ -18338,7 +18338,7 @@ F: include/linux/usb/isp116x.h USB LAN78XX ETHERNET DRIVER M: Woojung Huh -M: Microchip Linux Driver Support +M: UNGLinuxDriver@microchip.com L: netdev@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/net/microchip,lan78xx.txt @@ -18452,7 +18452,7 @@ F: drivers/net/usb/smsc75xx.* USB SMSC95XX ETHERNET DRIVER M: Steve Glendinning -M: Microchip Linux Driver Support +M: UNGLinuxDriver@microchip.com L: netdev@vger.kernel.org S: Maintained F: drivers/net/usb/smsc95xx.* @@ -18999,7 +18999,7 @@ F: drivers/input/mouse/vmmouse.h VMWARE VMXNET3 ETHERNET DRIVER M: Ronak Doshi -M: "VMware, Inc." +M: pv-drivers@vmware.com L: netdev@vger.kernel.org S: Maintained F: drivers/net/vmxnet3/ From a0c8be56affa7d5ffbdec24c992223be54db3b6e Mon Sep 17 00:00:00 2001 From: Lijun Pan Date: Sat, 19 Dec 2020 15:39:19 -0600 Subject: [PATCH 116/547] ibmvnic: fix login buffer memory leak Commit 34f0f4e3f488 ("ibmvnic: Fix login buffer memory leaks") frees login_rsp_buffer in release_resources() and send_login() because handle_login_rsp() does not free it. Commit f3ae59c0c015 ("ibmvnic: store RX and TX subCRQ handle array in ibmvnic_adapter struct") frees login_rsp_buffer in handle_login_rsp(). It seems unnecessary to free it in release_resources() and send_login(). There are chances that handle_login_rsp returns earlier without freeing buffers. Double-checking the buffer is harmless since release_login_buffer and release_login_rsp_buffer will do nothing if buffer is already freed. Fixes: f3ae59c0c015 ("ibmvnic: store RX and TX subCRQ handle array in ibmvnic_adapter struct") Fixes: 34f0f4e3f488 ("ibmvnic: Fix login buffer memory leaks") Signed-off-by: Lijun Pan Link: https://lore.kernel.org/r/20201219213919.21045-1-ljp@linux.ibm.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ibm/ibmvnic.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index f302504faa8a..bd0281020d51 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -955,6 +955,7 @@ static void release_resources(struct ibmvnic_adapter *adapter) release_rx_pools(adapter); release_napi(adapter); + release_login_buffer(adapter); release_login_rsp_buffer(adapter); } @@ -3873,7 +3874,9 @@ static int send_login(struct ibmvnic_adapter *adapter) return -1; } + release_login_buffer(adapter); release_login_rsp_buffer(adapter); + client_data_len = vnic_client_data_len(adapter); buffer_size = From 58f60329a6be35a5653edb3fd2023ccef9eb9943 Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Sun, 20 Dec 2020 16:29:30 +0800 Subject: [PATCH 117/547] net: ethernet: mvneta: Fix error handling in mvneta_probe When mvneta_port_power_up() fails, we should execute cleanup functions after label err_netdev to avoid memleak. Fixes: 41c2b6b4f0f80 ("net: ethernet: mvneta: Add back interface mode validation") Signed-off-by: Dinghao Liu Link: https://lore.kernel.org/r/20201220082930.21623-1-dinghao.liu@zju.edu.cn Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/mvneta.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 563ceac3060f..3369ec717a51 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -5255,7 +5255,7 @@ static int mvneta_probe(struct platform_device *pdev) err = mvneta_port_power_up(pp, pp->phy_interface); if (err < 0) { dev_err(&pdev->dev, "can't power up port\n"); - return err; + goto err_netdev; } /* Armada3700 network controller does not support per-cpu From 1d898b283576c38dedcb6b21fcbb65968ab03581 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Sun, 20 Dec 2020 10:49:47 +0200 Subject: [PATCH 118/547] docs: netdev-FAQ: fix question headers formatting Join adjacent questions to a single question line. This fixes the formatting of questions that were not part of the heading. Also, drop Q: and A: prefixes. We don't need them now that questions and answers are visually separated. Signed-off-by: Baruch Siach Link: https://lore.kernel.org/r/f76078ba5547744f2ec178984c32fbc7dcd29a2b.1608454187.git.baruch@tkos.co.il Signed-off-by: Jakub Kicinski --- Documentation/networking/netdev-FAQ.rst | 126 +++++++++++------------- 1 file changed, 59 insertions(+), 67 deletions(-) diff --git a/Documentation/networking/netdev-FAQ.rst b/Documentation/networking/netdev-FAQ.rst index 4b9ed5874d5a..ae2ae37cd921 100644 --- a/Documentation/networking/netdev-FAQ.rst +++ b/Documentation/networking/netdev-FAQ.rst @@ -6,9 +6,9 @@ netdev FAQ ========== -Q: What is netdev? ------------------- -A: It is a mailing list for all network-related Linux stuff. This +What is netdev? +--------------- +It is a mailing list for all network-related Linux stuff. This includes anything found under net/ (i.e. core code like IPv6) and drivers/net (i.e. hardware specific drivers) in the Linux source tree. @@ -25,9 +25,9 @@ Aside from subsystems like that mentioned above, all network-related Linux development (i.e. RFC, review, comments, etc.) takes place on netdev. -Q: How do the changes posted to netdev make their way into Linux? ------------------------------------------------------------------ -A: There are always two trees (git repositories) in play. Both are +How do the changes posted to netdev make their way into Linux? +-------------------------------------------------------------- +There are always two trees (git repositories) in play. Both are driven by David Miller, the main network maintainer. There is the ``net`` tree, and the ``net-next`` tree. As you can probably guess from the names, the ``net`` tree is for fixes to existing code already in the @@ -37,9 +37,9 @@ for the future release. You can find the trees here: - https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git - https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git -Q: How often do changes from these trees make it to the mainline Linus tree? ----------------------------------------------------------------------------- -A: To understand this, you need to know a bit of background information on +How often do changes from these trees make it to the mainline Linus tree? +------------------------------------------------------------------------- +To understand this, you need to know a bit of background information on the cadence of Linux development. Each new release starts off with a two week "merge window" where the main maintainers feed their new stuff to Linus for merging into the mainline tree. After the two weeks, the @@ -81,7 +81,8 @@ focus for ``net`` is on stabilization and bug fixes. Finally, the vX.Y gets released, and the whole cycle starts over. -Q: So where are we now in this cycle? +So where are we now in this cycle? +---------------------------------- Load the mainline (Linus) page here: @@ -91,9 +92,9 @@ and note the top of the "tags" section. If it is rc1, it is early in the dev cycle. If it was tagged rc7 a week ago, then a release is probably imminent. -Q: How do I indicate which tree (net vs. net-next) my patch should be in? -------------------------------------------------------------------------- -A: Firstly, think whether you have a bug fix or new "next-like" content. +How do I indicate which tree (net vs. net-next) my patch should be in? +---------------------------------------------------------------------- +Firstly, think whether you have a bug fix or new "next-like" content. Then once decided, assuming that you use git, use the prefix flag, i.e. :: @@ -105,48 +106,45 @@ in the above is just the subject text of the outgoing e-mail, and you can manually change it yourself with whatever MUA you are comfortable with. -Q: I sent a patch and I'm wondering what happened to it? --------------------------------------------------------- -Q: How can I tell whether it got merged? -A: Start by looking at the main patchworks queue for netdev: +I sent a patch and I'm wondering what happened to it - how can I tell whether it got merged? +-------------------------------------------------------------------------------------------- +Start by looking at the main patchworks queue for netdev: https://patchwork.kernel.org/project/netdevbpf/list/ The "State" field will tell you exactly where things are at with your patch. -Q: The above only says "Under Review". How can I find out more? ----------------------------------------------------------------- -A: Generally speaking, the patches get triaged quickly (in less than +The above only says "Under Review". How can I find out more? +------------------------------------------------------------- +Generally speaking, the patches get triaged quickly (in less than 48h). So be patient. Asking the maintainer for status updates on your patch is a good way to ensure your patch is ignored or pushed to the bottom of the priority list. -Q: I submitted multiple versions of the patch series ----------------------------------------------------- -Q: should I directly update patchwork for the previous versions of these -patch series? -A: No, please don't interfere with the patch status on patchwork, leave +I submitted multiple versions of the patch series. Should I directly update patchwork for the previous versions of these patch series? +-------------------------------------------------------------------------------------------------------------------------------------- +No, please don't interfere with the patch status on patchwork, leave it to the maintainer to figure out what is the most recent and current version that should be applied. If there is any doubt, the maintainer will reply and ask what should be done. -Q: I made changes to only a few patches in a patch series should I resend only those changed? ---------------------------------------------------------------------------------------------- -A: No, please resend the entire patch series and make sure you do number your +I made changes to only a few patches in a patch series should I resend only those changed? +------------------------------------------------------------------------------------------ +No, please resend the entire patch series and make sure you do number your patches such that it is clear this is the latest and greatest set of patches that can be applied. -Q: I submitted multiple versions of a patch series and it looks like a version other than the last one has been accepted, what should I do? -------------------------------------------------------------------------------------------------------------------------------------------- -A: There is no revert possible, once it is pushed out, it stays like that. +I submitted multiple versions of a patch series and it looks like a version other than the last one has been accepted, what should I do? +---------------------------------------------------------------------------------------------------------------------------------------- +There is no revert possible, once it is pushed out, it stays like that. Please send incremental versions on top of what has been merged in order to fix the patches the way they would look like if your latest patch series was to be merged. -Q: How can I tell what patches are queued up for backporting to the various stable releases? --------------------------------------------------------------------------------------------- -A: Normally Greg Kroah-Hartman collects stable commits himself, but for +How can I tell what patches are queued up for backporting to the various stable releases? +----------------------------------------------------------------------------------------- +Normally Greg Kroah-Hartman collects stable commits himself, but for networking, Dave collects up patches he deems critical for the networking subsystem, and then hands them off to Greg. @@ -169,11 +167,9 @@ simply clone the repo, and then git grep the mainline commit ID, e.g. releases/3.9.8/ipv6-fix-possible-crashes-in-ip6_cork_release.patch stable/stable-queue$ -Q: I see a network patch and I think it should be backported to stable. ------------------------------------------------------------------------ -Q: Should I request it via stable@vger.kernel.org like the references in -the kernel's Documentation/process/stable-kernel-rules.rst file say? -A: No, not for networking. Check the stable queues as per above first +I see a network patch and I think it should be backported to stable. Should I request it via stable@vger.kernel.org like the references in the kernel's Documentation/process/stable-kernel-rules.rst file say? +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +No, not for networking. Check the stable queues as per above first to see if it is already queued. If not, then send a mail to netdev, listing the upstream commit ID and why you think it should be a stable candidate. @@ -190,11 +186,9 @@ mainline, the better the odds that it is an OK candidate for stable. So scrambling to request a commit be added the day after it appears should be avoided. -Q: I have created a network patch and I think it should be backported to stable. --------------------------------------------------------------------------------- -Q: Should I add a Cc: stable@vger.kernel.org like the references in the -kernel's Documentation/ directory say? -A: No. See above answer. In short, if you think it really belongs in +I have created a network patch and I think it should be backported to stable. Should I add a Cc: stable@vger.kernel.org like the references in the kernel's Documentation/ directory say? +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +No. See above answer. In short, if you think it really belongs in stable, then ensure you write a decent commit log that describes who gets impacted by the bug fix and how it manifests itself, and when the bug was introduced. If you do that properly, then the commit will get @@ -207,18 +201,18 @@ marker line as described in :ref:`Documentation/process/submitting-patches.rst ` to temporarily embed that information into the patch that you send. -Q: Are all networking bug fixes backported to all stable releases? ------------------------------------------------------------------- -A: Due to capacity, Dave could only take care of the backports for the +Are all networking bug fixes backported to all stable releases? +--------------------------------------------------------------- +Due to capacity, Dave could only take care of the backports for the last two stable releases. For earlier stable releases, each stable branch maintainer is supposed to take care of them. If you find any patch is missing from an earlier stable branch, please notify stable@vger.kernel.org with either a commit ID or a formal patch backported, and CC Dave and other relevant networking developers. -Q: Is the comment style convention different for the networking content? ------------------------------------------------------------------------- -A: Yes, in a largely trivial way. Instead of this:: +Is the comment style convention different for the networking content? +--------------------------------------------------------------------- +Yes, in a largely trivial way. Instead of this:: /* * foobar blah blah blah @@ -231,32 +225,30 @@ it is requested that you make it look like this:: * another line of text */ -Q: I am working in existing code that has the former comment style and not the latter. --------------------------------------------------------------------------------------- -Q: Should I submit new code in the former style or the latter? -A: Make it the latter style, so that eventually all code in the domain +I am working in existing code that has the former comment style and not the latter. Should I submit new code in the former style or the latter? +----------------------------------------------------------------------------------------------------------------------------------------------- +Make it the latter style, so that eventually all code in the domain of netdev is of this format. -Q: I found a bug that might have possible security implications or similar. ---------------------------------------------------------------------------- -Q: Should I mail the main netdev maintainer off-list?** -A: No. The current netdev maintainer has consistently requested that +I found a bug that might have possible security implications or similar. Should I mail the main netdev maintainer off-list? +--------------------------------------------------------------------------------------------------------------------------- +No. The current netdev maintainer has consistently requested that people use the mailing lists and not reach out directly. If you aren't OK with that, then perhaps consider mailing security@kernel.org or reading about http://oss-security.openwall.org/wiki/mailing-lists/distros as possible alternative mechanisms. -Q: What level of testing is expected before I submit my change? ---------------------------------------------------------------- -A: If your changes are against ``net-next``, the expectation is that you +What level of testing is expected before I submit my change? +------------------------------------------------------------ +If your changes are against ``net-next``, the expectation is that you have tested by layering your changes on top of ``net-next``. Ideally you will have done run-time testing specific to your change, but at a minimum, your changes should survive an ``allyesconfig`` and an ``allmodconfig`` build without new warnings or failures. -Q: How do I post corresponding changes to user space components? ----------------------------------------------------------------- -A: User space code exercising kernel features should be posted +How do I post corresponding changes to user space components? +------------------------------------------------------------- +User space code exercising kernel features should be posted alongside kernel patches. This gives reviewers a chance to see how any new interface is used and how well it works. @@ -280,9 +272,9 @@ to the mailing list, e.g.:: Posting as one thread is discouraged because it confuses patchwork (as of patchwork 2.2.2). -Q: Any other tips to help ensure my net/net-next patch gets OK'd? ------------------------------------------------------------------ -A: Attention to detail. Re-read your own work as if you were the +Any other tips to help ensure my net/net-next patch gets OK'd? +-------------------------------------------------------------- +Attention to detail. Re-read your own work as if you were the reviewer. You can start with using ``checkpatch.pl``, perhaps even with the ``--strict`` flag. But do not be mindlessly robotic in doing so. If your change is a bug fix, make sure your commit log indicates the From 5d5647dad259bb416fd5d3d87012760386d97530 Mon Sep 17 00:00:00 2001 From: Manish Chopra Date: Mon, 21 Dec 2020 06:55:30 -0800 Subject: [PATCH 119/547] qede: fix offload for IPIP tunnel packets IPIP tunnels packets are unknown to device, hence these packets are incorrectly parsed and caused the packet corruption, so disable offlods for such packets at run time. Signed-off-by: Manish Chopra Signed-off-by: Sudarsana Kalluru Signed-off-by: Igor Russkikh Link: https://lore.kernel.org/r/20201221145530.7771-1-manishc@marvell.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/qlogic/qede/qede_fp.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c index a2494bf85007..ca0ee29a57b5 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_fp.c +++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c @@ -1799,6 +1799,11 @@ netdev_features_t qede_features_check(struct sk_buff *skb, ntohs(udp_hdr(skb)->dest) != gnv_port)) return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); + } else if (l4_proto == IPPROTO_IPIP) { + /* IPIP tunnels are unknown to the device or at least unsupported natively, + * offloads for them can't be done trivially, so disable them for such skb. + */ + return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); } } From f86de9b1c0663b0a3ca2dcddec9aa910ff0fbf2c Mon Sep 17 00:00:00 2001 From: Kailang Yang Date: Fri, 23 Oct 2020 14:46:47 +0800 Subject: [PATCH 120/547] ALSA: hda/realtek - Fix speaker volume control on Lenovo C940 Cannot adjust speaker's volume on Lenovo C940. Applying the alc298_fixup_speaker_volume function can fix the issue. [ Additional note: C940 has I2S amp for the speaker and this needs the same initialization as Dell machines. The patch was slightly modified so that the quirk entry is moved next to the corresponding Dell quirk entry. -- tiwai ] Signed-off-by: Kailang Yang Cc: Link: https://lore.kernel.org/r/ea25b4e5c468491aa2e9d6cb1f2fced3@realtek.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index dde5ba209541..5b03259622a8 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -6289,6 +6289,7 @@ enum { ALC221_FIXUP_HP_FRONT_MIC, ALC292_FIXUP_TPT460, ALC298_FIXUP_SPK_VOLUME, + ALC298_FIXUP_LENOVO_SPK_VOLUME, ALC256_FIXUP_DELL_INSPIRON_7559_SUBWOOFER, ALC269_FIXUP_ATIV_BOOK_8, ALC221_FIXUP_HP_MIC_NO_PRESENCE, @@ -7119,6 +7120,10 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC298_FIXUP_DELL_AIO_MIC_NO_PRESENCE, }, + [ALC298_FIXUP_LENOVO_SPK_VOLUME] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc298_fixup_speaker_volume, + }, [ALC295_FIXUP_DISABLE_DAC3] = { .type = HDA_FIXUP_FUNC, .v.func = alc295_fixup_disable_dac3, @@ -8126,6 +8131,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x3151, "ThinkCentre Station", ALC283_FIXUP_HEADSET_MIC), SND_PCI_QUIRK(0x17aa, 0x3176, "ThinkCentre Station", ALC283_FIXUP_HEADSET_MIC), SND_PCI_QUIRK(0x17aa, 0x3178, "ThinkCentre Station", ALC283_FIXUP_HEADSET_MIC), + SND_PCI_QUIRK(0x17aa, 0x3818, "Lenovo C940", ALC298_FIXUP_LENOVO_SPK_VOLUME), SND_PCI_QUIRK(0x17aa, 0x3902, "Lenovo E50-80", ALC269_FIXUP_DMIC_THINKPAD_ACPI), SND_PCI_QUIRK(0x17aa, 0x3977, "IdeaPad S210", ALC283_FIXUP_INT_MIC), SND_PCI_QUIRK(0x17aa, 0x3978, "Lenovo B50-70", ALC269_FIXUP_DMIC_THINKPAD_ACPI), From c1e8952395c1f44a6304c71401519d19ed2ac56a Mon Sep 17 00:00:00 2001 From: Kailang Yang Date: Wed, 23 Dec 2020 15:34:57 +0800 Subject: [PATCH 121/547] ALSA: hda/realtek - Modify Dell platform name Dell platform SSID:0x0a58 change platform name. Use the generic name instead for avoiding confusion. Fixes: 150927c3674d ("ALSA: hda/realtek - Supported Dell fixed type headset") Signed-off-by: Kailang Yang Cc: Link: https://lore.kernel.org/r/efe7c196158241aa817229df7835d645@realtek.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 5b03259622a8..f49cf0f227c4 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -7890,7 +7890,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1028, 0x09bf, "Dell Precision", ALC233_FIXUP_ASUS_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x0a2e, "Dell", ALC236_FIXUP_DELL_AIO_HEADSET_MIC), SND_PCI_QUIRK(0x1028, 0x0a30, "Dell", ALC236_FIXUP_DELL_AIO_HEADSET_MIC), - SND_PCI_QUIRK(0x1028, 0x0a58, "Dell Precision 3650 Tower", ALC255_FIXUP_DELL_HEADSET_MIC), + SND_PCI_QUIRK(0x1028, 0x0a58, "Dell", ALC255_FIXUP_DELL_HEADSET_MIC), SND_PCI_QUIRK(0x1028, 0x164a, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x164b, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x103c, 0x1586, "HP", ALC269_FIXUP_HP_MUTE_LED_MIC2), From 3557ae187c32203d1bb8b48ee1e2e7bdb23d98d5 Mon Sep 17 00:00:00 2001 From: Alexandru Elisei Date: Tue, 1 Dec 2020 15:01:53 +0000 Subject: [PATCH 122/547] KVM: Documentation: Add arm64 KVM_RUN error codes The API documentation states that general error codes are not detailed, but errors with specific meanings are. On arm64, KVM_RUN can return error numbers with a different meaning than what is described by POSIX or the C99 standard (as taken from man 3 errno). Absent from the newly documented error codes is ERANGE which can be returned when making a change to the EL2 stage 1 tables if the address is larger than the largest supported input address. Assuming no bugs in the implementation, that is not possible because the input addresses which are mapped are the result of applying the macro kern_hyp_va() on kernel virtual addresses. CC: Paolo Bonzini Signed-off-by: Alexandru Elisei Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201201150157.223625-2-alexandru.elisei@arm.com --- Documentation/virt/kvm/api.rst | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index e00a66d72372..4e5316ed10e9 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -380,9 +380,14 @@ This ioctl is obsolete and has been removed. Errors: - ===== ============================= + ======= ============================================================== EINTR an unmasked signal is pending - ===== ============================= + ENOEXEC the vcpu hasn't been initialized or the guest tried to execute + instructions from device memory (arm64) + ENOSYS data abort outside memslots with no syndrome info and + KVM_CAP_ARM_NISV_TO_USER not enabled (arm64) + EPERM SVE feature set but not finalized (arm64) + ======= ============================================================== This ioctl is used to run a guest virtual cpu. While there are no explicit parameters, there is an implicit parameter block that can be From f16570ba47ff2b3766ebeaba6f4b80ad48cfd6a1 Mon Sep 17 00:00:00 2001 From: Alexandru Elisei Date: Tue, 1 Dec 2020 15:01:54 +0000 Subject: [PATCH 123/547] KVM: arm64: arch_timer: Remove VGIC initialization check kvm_timer_enable() is called in kvm_vcpu_first_run_init() after kvm_vgic_map_resources() if the VGIC wasn't ready. kvm_vgic_map_resources() is the only place where kvm->arch.vgic.ready is set to true. For a v2 VGIC, kvm_vgic_map_resources() will attempt to initialize the VGIC and set the initialized flag. For a v3 VGIC, kvm_vgic_map_resources() will return an error code if the VGIC isn't already initialized. The end result is that if we've reached kvm_timer_enable(), the VGIC is initialzed and ready and vgic_initialized() will always be true, so remove this check. Signed-off-by: Alexandru Elisei Reviewed-by: Eric Auger [maz: added comment about vgic initialisation, as suggested by Eric] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201201150157.223625-3-alexandru.elisei@arm.com --- arch/arm64/kvm/arch_timer.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 32ba6fbc3814..74e0699661e9 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -1129,9 +1129,10 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu) if (!irqchip_in_kernel(vcpu->kvm)) goto no_vgic; - if (!vgic_initialized(vcpu->kvm)) - return -ENODEV; - + /* + * At this stage, we have the guarantee that the vgic is both + * available and initialized. + */ if (!timer_irqs_are_valid(vcpu)) { kvm_debug("incorrectly configured timer irqs\n"); return -EINVAL; From 1c91f06d296de4f0c27022f5ec464e047d471215 Mon Sep 17 00:00:00 2001 From: Alexandru Elisei Date: Tue, 1 Dec 2020 15:01:55 +0000 Subject: [PATCH 124/547] KVM: arm64: Move double-checked lock to kvm_vgic_map_resources() kvm_vgic_map_resources() is called when a VCPU if first run and it maps all the VGIC MMIO regions. To prevent double-initialization, the VGIC uses the ready variable to keep track of the state of resources and the global KVM mutex to protect against concurrent accesses. After the lock is taken, the variable is checked again in case another VCPU took the lock between the current VCPU reading ready equals false and taking the lock. The double-checked lock pattern is spread across four different functions: in kvm_vcpu_first_run_init(), in kvm_vgic_map_resource() and in vgic_{v2,v3}_map_resources(), which makes it hard to reason about and introduces minor code duplication. Consolidate the checks in kvm_vgic_map_resources(), where the lock is taken. No functional change intended. Signed-off-by: Alexandru Elisei Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201201150157.223625-4-alexandru.elisei@arm.com --- arch/arm64/kvm/arm.c | 8 +++----- arch/arm64/kvm/vgic/vgic-init.c | 6 ++++++ arch/arm64/kvm/vgic/vgic-v2.c | 3 --- arch/arm64/kvm/vgic/vgic-v3.c | 3 --- 4 files changed, 9 insertions(+), 11 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index e207e4541f55..ab782c480e9a 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -580,11 +580,9 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu) * Map the VGIC hardware resources before running a vcpu the * first time on this VM. */ - if (unlikely(!vgic_ready(kvm))) { - ret = kvm_vgic_map_resources(kvm); - if (ret) - return ret; - } + ret = kvm_vgic_map_resources(kvm); + if (ret) + return ret; } else { /* * Tell the rest of the code that there are userspace irqchip diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 32e32d67a127..a2f4d1c85f00 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -428,7 +428,13 @@ int kvm_vgic_map_resources(struct kvm *kvm) struct vgic_dist *dist = &kvm->arch.vgic; int ret = 0; + if (likely(vgic_ready(kvm))) + return 0; + mutex_lock(&kvm->lock); + if (vgic_ready(kvm)) + goto out; + if (!irqchip_in_kernel(kvm)) goto out; diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index ebf53a4e1296..7f38c1a93639 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -306,9 +306,6 @@ int vgic_v2_map_resources(struct kvm *kvm) struct vgic_dist *dist = &kvm->arch.vgic; int ret = 0; - if (vgic_ready(kvm)) - goto out; - if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) || IS_VGIC_ADDR_UNDEF(dist->vgic_cpu_base)) { kvm_err("Need to set vgic cpu and dist addresses first\n"); diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 9cdf39a94a63..35029c5cb0f1 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -500,9 +500,6 @@ int vgic_v3_map_resources(struct kvm *kvm) int ret = 0; int c; - if (vgic_ready(kvm)) - goto out; - kvm_for_each_vcpu(c, vcpu, kvm) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; From de33212f768c5d9e2fe791b008cb26f92f0aa31c Mon Sep 17 00:00:00 2001 From: Jeff Dike Date: Tue, 22 Dec 2020 21:54:21 -0500 Subject: [PATCH 125/547] virtio_net: Fix recursive call to cpus_read_lock() virtnet_set_channels can recursively call cpus_read_lock if CONFIG_XPS and CONFIG_HOTPLUG are enabled. The path is: virtnet_set_channels - calls get_online_cpus(), which is a trivial wrapper around cpus_read_lock() netif_set_real_num_tx_queues netif_reset_xps_queues_gt netif_reset_xps_queues - calls cpus_read_lock() This call chain and potential deadlock happens when the number of TX queues is reduced. This commit the removes netif_set_real_num_[tr]x_queues calls from inside the get/put_online_cpus section, as they don't require that it be held. Fixes: 47be24796c13 ("virtio-net: fix the set affinity bug when CPU IDs are not consecutive") Signed-off-by: Jeff Dike Acked-by: Jason Wang Acked-by: Michael S. Tsirkin Link: https://lore.kernel.org/r/20201223025421.671-1-jdike@akamai.com Signed-off-by: Jakub Kicinski --- drivers/net/virtio_net.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 052975ea0af4..ee611d107294 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2093,14 +2093,16 @@ static int virtnet_set_channels(struct net_device *dev, get_online_cpus(); err = _virtnet_set_queues(vi, queue_pairs); - if (!err) { - netif_set_real_num_tx_queues(dev, queue_pairs); - netif_set_real_num_rx_queues(dev, queue_pairs); - - virtnet_set_affinity(vi); + if (err) { + put_online_cpus(); + goto err; } + virtnet_set_affinity(vi); put_online_cpus(); + netif_set_real_num_tx_queues(dev, queue_pairs); + netif_set_real_num_rx_queues(dev, queue_pairs); + err: return err; } From c06ccf3ebb7503706ea49fd248e709287ef385a3 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 23 Dec 2020 18:45:57 +0100 Subject: [PATCH 126/547] ALSA: usb-audio: Fix UBSAN warnings for MIDI jacks The calculation of in_cables and out_cables bitmaps are done with the bit shift by the value from the descriptor, which is an arbitrary value, and can lead to UBSAN shift-out-of-bounds warnings. Fix it by filtering the bad descriptor values with the check of the upper bound 0x10 (the cable bitmaps are 16 bits). Reported-by: syzbot+92e45ae45543f89e8c88@syzkaller.appspotmail.com Cc: Link: https://lore.kernel.org/r/20201223174557.10249-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/usb/midi.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/usb/midi.c b/sound/usb/midi.c index c8213652470c..0c23fa6d8525 100644 --- a/sound/usb/midi.c +++ b/sound/usb/midi.c @@ -1889,6 +1889,8 @@ static int snd_usbmidi_get_ms_info(struct snd_usb_midi *umidi, ms_ep = find_usb_ms_endpoint_descriptor(hostep); if (!ms_ep) continue; + if (ms_ep->bNumEmbMIDIJack > 0x10) + continue; if (usb_endpoint_dir_out(ep)) { if (endpoints[epidx].out_ep) { if (++epidx >= MIDI_MAX_ENDPOINTS) { @@ -2141,6 +2143,8 @@ static int snd_usbmidi_detect_roland(struct snd_usb_midi *umidi, cs_desc[1] == USB_DT_CS_INTERFACE && cs_desc[2] == 0xf1 && cs_desc[3] == 0x02) { + if (cs_desc[4] > 0x10 || cs_desc[5] > 0x10) + continue; endpoint->in_cables = (1 << cs_desc[4]) - 1; endpoint->out_cables = (1 << cs_desc[5]) - 1; return snd_usbmidi_detect_endpoints(umidi, endpoint, 1); From b250bf5f924f7b42725fc9e4135aa0b667dfb119 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 22 Dec 2020 09:16:13 -0600 Subject: [PATCH 127/547] net: ipa: fix interconnect enable bug When the core clock rate and interconnect bandwidth specifications were moved into configuration data, a copy/paste bug was introduced, causing the memory interconnect bandwidth to be set three times rather than enabling the three different interconnects. Fix this bug. Fixes: 91d02f9551501 ("net: ipa: use config data for clocking") Signed-off-by: Alex Elder Reviewed-by: Georgi Djakov Link: https://lore.kernel.org/r/20201222151613.5730-1-elder@linaro.org Signed-off-by: Jakub Kicinski --- drivers/net/ipa/ipa_clock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ipa/ipa_clock.c b/drivers/net/ipa/ipa_clock.c index 9dcf16f399b7..135c393437f1 100644 --- a/drivers/net/ipa/ipa_clock.c +++ b/drivers/net/ipa/ipa_clock.c @@ -115,13 +115,13 @@ static int ipa_interconnect_enable(struct ipa *ipa) return ret; data = &clock->interconnect_data[IPA_INTERCONNECT_IMEM]; - ret = icc_set_bw(clock->memory_path, data->average_rate, + ret = icc_set_bw(clock->imem_path, data->average_rate, data->peak_rate); if (ret) goto err_memory_path_disable; data = &clock->interconnect_data[IPA_INTERCONNECT_CONFIG]; - ret = icc_set_bw(clock->memory_path, data->average_rate, + ret = icc_set_bw(clock->config_path, data->average_rate, data->peak_rate); if (ret) goto err_imem_path_disable; From 8450e23f142f629e40bd67afc8375c86c7fbf8f1 Mon Sep 17 00:00:00 2001 From: Noor Azura Ahmad Tarmizi Date: Wed, 23 Dec 2020 00:03:37 +0800 Subject: [PATCH 128/547] stmmac: intel: Add PCI IDs for TGL-H platform Add TGL-H PCI info and PCI IDs for the new TSN Controller to the list of supported devices. Signed-off-by: Noor Azura Ahmad Tarmizi Signed-off-by: Voon Weifeng Signed-off-by: Muhammad Husaini Zulkifli Link: https://lore.kernel.org/r/20201222160337.30870-1-muhammad.husaini.zulkifli@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c index a2e80c89de2d..9a6a519426a0 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c @@ -721,6 +721,8 @@ static SIMPLE_DEV_PM_OPS(intel_eth_pm_ops, intel_eth_pci_suspend, #define PCI_DEVICE_ID_INTEL_EHL_PSE1_RGMII1G_ID 0x4bb0 #define PCI_DEVICE_ID_INTEL_EHL_PSE1_SGMII1G_ID 0x4bb1 #define PCI_DEVICE_ID_INTEL_EHL_PSE1_SGMII2G5_ID 0x4bb2 +#define PCI_DEVICE_ID_INTEL_TGLH_SGMII1G_0_ID 0x43ac +#define PCI_DEVICE_ID_INTEL_TGLH_SGMII1G_1_ID 0x43a2 #define PCI_DEVICE_ID_INTEL_TGL_SGMII1G_ID 0xa0ac static const struct pci_device_id intel_eth_pci_id_table[] = { @@ -735,6 +737,8 @@ static const struct pci_device_id intel_eth_pci_id_table[] = { { PCI_DEVICE_DATA(INTEL, EHL_PSE1_SGMII1G_ID, &ehl_pse1_sgmii1g_info) }, { PCI_DEVICE_DATA(INTEL, EHL_PSE1_SGMII2G5_ID, &ehl_pse1_sgmii1g_info) }, { PCI_DEVICE_DATA(INTEL, TGL_SGMII1G_ID, &tgl_sgmii1g_info) }, + { PCI_DEVICE_DATA(INTEL, TGLH_SGMII1G_0_ID, &tgl_sgmii1g_info) }, + { PCI_DEVICE_DATA(INTEL, TGLH_SGMII1G_1_ID, &tgl_sgmii1g_info) }, {} }; MODULE_DEVICE_TABLE(pci, intel_eth_pci_id_table); From 94ad8f3ac6aff5acde3f6c4719997efc61e0dccf Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 22 Dec 2020 12:00:10 -0600 Subject: [PATCH 129/547] net: ipa: clear pending interrupts before enabling We enable the completion interrupt for channel or event ring commands only when we issue them. The interrupt is disabled after the interrupt has fired, or after we have timed out waiting for it. If we time out, the command could complete after the interrupt has been disabled, causing a state change in the channel or event ring. The interrupt associated with that state change would be delivered the next time the completion interrupt is enabled. To avoid previous command completions interfering with new commands, clear all pending completion interrupts before re-enabling them for a new command. Fixes: b4175f8731f78 ("net: ipa: only enable GSI event control IRQs when needed") Signed-off-by: Alex Elder Signed-off-by: Jakub Kicinski --- drivers/net/ipa/gsi.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/net/ipa/gsi.c b/drivers/net/ipa/gsi.c index c4795249719d..4aee60d62ab0 100644 --- a/drivers/net/ipa/gsi.c +++ b/drivers/net/ipa/gsi.c @@ -340,7 +340,13 @@ static int evt_ring_command(struct gsi *gsi, u32 evt_ring_id, * is issued here. Only permit *this* event ring to trigger * an interrupt, and only enable the event control IRQ type * when we expect it to occur. + * + * There's a small chance that a previous command completed + * after the interrupt was disabled, so make sure we have no + * pending interrupts before we enable them. */ + iowrite32(~0, gsi->virt + GSI_CNTXT_SRC_EV_CH_IRQ_CLR_OFFSET); + val = BIT(evt_ring_id); iowrite32(val, gsi->virt + GSI_CNTXT_SRC_EV_CH_IRQ_MSK_OFFSET); gsi_irq_type_enable(gsi, GSI_EV_CTRL); @@ -453,7 +459,13 @@ gsi_channel_command(struct gsi_channel *channel, enum gsi_ch_cmd_opcode opcode) * issued here. So we only permit *this* channel to trigger * an interrupt and only enable the channel control IRQ type * when we expect it to occur. + * + * There's a small chance that a previous command completed + * after the interrupt was disabled, so make sure we have no + * pending interrupts before we enable them. */ + iowrite32(~0, gsi->virt + GSI_CNTXT_SRC_CH_IRQ_CLR_OFFSET); + val = BIT(channel_id); iowrite32(val, gsi->virt + GSI_CNTXT_SRC_CH_IRQ_MSK_OFFSET); gsi_irq_type_enable(gsi, GSI_CH_CTRL); From 6ffddf3b3d182d886d754cfafdf909ccb14f464b Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 22 Dec 2020 12:00:11 -0600 Subject: [PATCH 130/547] net: ipa: use state to determine channel command success The result of issuing a channel control command should be that the channel changes state. If enabled, a completion interrupt signals that the channel state has changed. This interrupt is enabled by gsi_channel_command() and disabled again after the command has completed (or we time out). There is a window of time--after the completion interrupt is disabled but before the channel state is read--during which the command could complete successfully without interrupting. This would cause the channel to transition to the desired new state. So whether a channel command ends via completion interrupt or timeout, we can consider the command successful if the channel has entered the desired state (and a failure if it has not, regardless of the cause). Fixes: d6c9e3f506ae8 ("net: ipa: only enable generic command completion IRQ when needed"); Signed-off-by: Alex Elder Signed-off-by: Jakub Kicinski --- drivers/net/ipa/gsi.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/drivers/net/ipa/gsi.c b/drivers/net/ipa/gsi.c index 4aee60d62ab0..4f0e79176423 100644 --- a/drivers/net/ipa/gsi.c +++ b/drivers/net/ipa/gsi.c @@ -505,15 +505,15 @@ static int gsi_channel_alloc_command(struct gsi *gsi, u32 channel_id) ret = gsi_channel_command(channel, GSI_CH_ALLOCATE); - /* Channel state will normally have been updated */ + /* If successful the channel state will have changed */ state = gsi_channel_state(channel); - if (!ret && state != GSI_CHANNEL_STATE_ALLOCATED) { - dev_err(dev, "channel %u bad state %u after alloc\n", - channel_id, state); - ret = -EIO; - } + if (state == GSI_CHANNEL_STATE_ALLOCATED) + return 0; - return ret; + dev_err(dev, "channel %u bad state %u after alloc\n", + channel_id, state); + + return -EIO; } /* Start an ALLOCATED channel */ @@ -533,15 +533,15 @@ static int gsi_channel_start_command(struct gsi_channel *channel) ret = gsi_channel_command(channel, GSI_CH_START); - /* Channel state will normally have been updated */ + /* If successful the channel state will have changed */ state = gsi_channel_state(channel); - if (!ret && state != GSI_CHANNEL_STATE_STARTED) { - dev_err(dev, "channel %u bad state %u after start\n", - gsi_channel_id(channel), state); - ret = -EIO; - } + if (state == GSI_CHANNEL_STATE_STARTED) + return 0; - return ret; + dev_err(dev, "channel %u bad state %u after start\n", + gsi_channel_id(channel), state); + + return -EIO; } /* Stop a GSI channel in STARTED state */ @@ -568,10 +568,10 @@ static int gsi_channel_stop_command(struct gsi_channel *channel) ret = gsi_channel_command(channel, GSI_CH_STOP); - /* Channel state will normally have been updated */ + /* If successful the channel state will have changed */ state = gsi_channel_state(channel); - if (ret || state == GSI_CHANNEL_STATE_STOPPED) - return ret; + if (state == GSI_CHANNEL_STATE_STOPPED) + return 0; /* We may have to try again if stop is in progress */ if (state == GSI_CHANNEL_STATE_STOP_IN_PROC) @@ -604,9 +604,9 @@ static void gsi_channel_reset_command(struct gsi_channel *channel) ret = gsi_channel_command(channel, GSI_CH_RESET); - /* Channel state will normally have been updated */ + /* If successful the channel state will have changed */ state = gsi_channel_state(channel); - if (!ret && state != GSI_CHANNEL_STATE_ALLOCATED) + if (state != GSI_CHANNEL_STATE_ALLOCATED) dev_err(dev, "channel %u bad state %u after reset\n", gsi_channel_id(channel), state); } @@ -628,9 +628,10 @@ static void gsi_channel_de_alloc_command(struct gsi *gsi, u32 channel_id) ret = gsi_channel_command(channel, GSI_CH_DE_ALLOC); - /* Channel state will normally have been updated */ + /* If successful the channel state will have changed */ state = gsi_channel_state(channel); - if (!ret && state != GSI_CHANNEL_STATE_NOT_ALLOCATED) + + if (state != GSI_CHANNEL_STATE_NOT_ALLOCATED) dev_err(dev, "channel %u bad state %u after dealloc\n", channel_id, state); } From 428b448ee764a264b7a2eeed295b282755114aa7 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 22 Dec 2020 12:00:12 -0600 Subject: [PATCH 131/547] net: ipa: use state to determine event ring command success This patch implements the same basic fix for event rings as the previous one does for channels. The result of issuing an event ring control command should be that the event ring changes state. If enabled, a completion interrupt signals that the event ring state has changed. This interrupt is enabled by gsi_evt_ring_command() and disabled again after the command has completed (or we time out). There is a window of time during which the command could complete successfully without interrupting. This would cause the event ring to transition to the desired new state. So whether a event ring command ends via completion interrupt or timeout, we can consider the command successful if the event ring has entered the desired state (and a failure if it has not, regardless of the cause). Fixes: b4175f8731f78 ("net: ipa: only enable GSI event control IRQs when needed") Signed-off-by: Alex Elder Signed-off-by: Jakub Kicinski --- drivers/net/ipa/gsi.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/drivers/net/ipa/gsi.c b/drivers/net/ipa/gsi.c index 4f0e79176423..579cc3e51677 100644 --- a/drivers/net/ipa/gsi.c +++ b/drivers/net/ipa/gsi.c @@ -384,13 +384,15 @@ static int gsi_evt_ring_alloc_command(struct gsi *gsi, u32 evt_ring_id) } ret = evt_ring_command(gsi, evt_ring_id, GSI_EVT_ALLOCATE); - if (!ret && evt_ring->state != GSI_EVT_RING_STATE_ALLOCATED) { - dev_err(gsi->dev, "event ring %u bad state %u after alloc\n", - evt_ring_id, evt_ring->state); - ret = -EIO; - } - return ret; + /* If successful the event ring state will have changed */ + if (evt_ring->state == GSI_EVT_RING_STATE_ALLOCATED) + return 0; + + dev_err(gsi->dev, "event ring %u bad state %u after alloc\n", + evt_ring_id, evt_ring->state); + + return -EIO; } /* Reset a GSI event ring in ALLOCATED or ERROR state. */ @@ -408,9 +410,13 @@ static void gsi_evt_ring_reset_command(struct gsi *gsi, u32 evt_ring_id) } ret = evt_ring_command(gsi, evt_ring_id, GSI_EVT_RESET); - if (!ret && evt_ring->state != GSI_EVT_RING_STATE_ALLOCATED) - dev_err(gsi->dev, "event ring %u bad state %u after reset\n", - evt_ring_id, evt_ring->state); + + /* If successful the event ring state will have changed */ + if (evt_ring->state == GSI_EVT_RING_STATE_ALLOCATED) + return; + + dev_err(gsi->dev, "event ring %u bad state %u after reset\n", + evt_ring_id, evt_ring->state); } /* Issue a hardware de-allocation request for an allocated event ring */ @@ -426,9 +432,13 @@ static void gsi_evt_ring_de_alloc_command(struct gsi *gsi, u32 evt_ring_id) } ret = evt_ring_command(gsi, evt_ring_id, GSI_EVT_DE_ALLOC); - if (!ret && evt_ring->state != GSI_EVT_RING_STATE_NOT_ALLOCATED) - dev_err(gsi->dev, "event ring %u bad state %u after dealloc\n", - evt_ring_id, evt_ring->state); + + /* If successful the event ring state will have changed */ + if (evt_ring->state == GSI_EVT_RING_STATE_NOT_ALLOCATED) + return; + + dev_err(gsi->dev, "event ring %u bad state %u after dealloc\n", + evt_ring_id, evt_ring->state); } /* Fetch the current state of a channel from hardware */ From 826f328e2b7e8854dd42ea44e6519cd75018e7b1 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 22 Dec 2020 22:49:44 +0100 Subject: [PATCH 132/547] net: dcb: Validate netlink message in DCB handler DCB uses the same handler function for both RTM_GETDCB and RTM_SETDCB messages. dcb_doit() bounces RTM_SETDCB mesasges if the user does not have the CAP_NET_ADMIN capability. However, the operation to be performed is not decided from the DCB message type, but from the DCB command. Thus DCB_CMD_*_GET commands are used for reading DCB objects, the corresponding SET and DEL commands are used for manipulation. The assumption is that set-like commands will be sent via an RTM_SETDCB message, and get-like ones via RTM_GETDCB. However, this assumption is not enforced. It is therefore possible to manipulate DCB objects without CAP_NET_ADMIN capability by sending the corresponding command in an RTM_GETDCB message. That is a bug. Fix it by validating the type of the request message against the type used for the response. Fixes: 2f90b8657ec9 ("ixgbe: this patch adds support for DCB to the kernel and ixgbe driver") Signed-off-by: Petr Machata Link: https://lore.kernel.org/r/a2a9b88418f3a58ef211b718f2970128ef9e3793.1608673640.git.me@pmachata.org Signed-off-by: Jakub Kicinski --- net/dcb/dcbnl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 084e159a12ba..7d49b6fd6cef 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -1765,6 +1765,8 @@ static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh, fn = &reply_funcs[dcb->cmd]; if (!fn->cb) return -EOPNOTSUPP; + if (fn->type != nlh->nlmsg_type) + return -EPERM; if (!tb[DCB_ATTR_IFNAME]) return -EINVAL; From 427c940558560bff2583d07fc119a21094675982 Mon Sep 17 00:00:00 2001 From: John Wang Date: Wed, 23 Dec 2020 13:55:23 +0800 Subject: [PATCH 133/547] net/ncsi: Use real net-device for response handler When aggregating ncsi interfaces and dedicated interfaces to bond interfaces, the ncsi response handler will use the wrong net device to find ncsi_dev, so that the ncsi interface will not work properly. Here, we use the original net device to fix it. Fixes: 138635cc27c9 ("net/ncsi: NCSI response packet handler") Signed-off-by: John Wang Link: https://lore.kernel.org/r/20201223055523.2069-1-wangzhiqiang.bj@bytedance.com Signed-off-by: Jakub Kicinski --- net/ncsi/ncsi-rsp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index 5b1f4ec66dd9..888ccc2d4e34 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -1120,7 +1120,7 @@ int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev, int payload, i, ret; /* Find the NCSI device */ - nd = ncsi_find_dev(dev); + nd = ncsi_find_dev(orig_dev); ndp = nd ? TO_NCSI_DEV_PRIV(nd) : NULL; if (!ndp) return -ENODEV; From 5d41f9b7ee7a5a5138894f58846a4ffed601498a Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Wed, 23 Dec 2020 19:06:12 +0800 Subject: [PATCH 134/547] net: ethernet: Fix memleak in ethoc_probe When mdiobus_register() fails, priv->mdio allocated by mdiobus_alloc() has not been freed, which leads to memleak. Fixes: e7f4dc3536a4 ("mdio: Move allocation of interrupts into core") Signed-off-by: Dinghao Liu Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20201223110615.31389-1-dinghao.liu@zju.edu.cn Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ethoc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ethoc.c b/drivers/net/ethernet/ethoc.c index 0981fe9652e5..3d9b0b161e24 100644 --- a/drivers/net/ethernet/ethoc.c +++ b/drivers/net/ethernet/ethoc.c @@ -1211,7 +1211,7 @@ static int ethoc_probe(struct platform_device *pdev) ret = mdiobus_register(priv->mdio); if (ret) { dev_err(&netdev->dev, "failed to register MDIO bus\n"); - goto free2; + goto free3; } ret = ethoc_mdio_probe(netdev); @@ -1243,6 +1243,7 @@ error2: netif_napi_del(&priv->napi); error: mdiobus_unregister(priv->mdio); +free3: mdiobus_free(priv->mdio); free2: clk_disable_unprepare(priv->clk); From 1f45dc22066797479072978feeada0852502e180 Mon Sep 17 00:00:00 2001 From: Lijun Pan Date: Wed, 23 Dec 2020 14:49:04 -0600 Subject: [PATCH 135/547] ibmvnic: continue fatal error reset after passive init Commit f9c6cea0b385 ("ibmvnic: Skip fatal error reset after passive init") says "If the passive CRQ initialization occurs before the FATAL reset task is processed, the FATAL error reset task would try to access a CRQ message queue that was freed, causing an oops. The problem may be most likely to occur during DLPAR add vNIC with a non-default MTU, because the DLPAR process will automatically issue a change MTU request. Fix this by not processing fatal error reset if CRQ is passively initialized after client-driven CRQ initialization fails." The original commit skips a specific reset condition, but that does not fix the problem it claims to fix, and misses a reset condition. The effective fix is commit 0e435befaea4 ("ibmvnic: fix NULL pointer dereference in ibmvic_reset_crq") and commit a0faaa27c716 ("ibmvnic: fix NULL pointer dereference in reset_sub_crq_queues"). With above two fixes, there are no more crashes seen as described even without the original commit, so I would like to revert the original commit. Fixes: f9c6cea0b385 ("ibmvnic: Skip fatal error reset after passive init") Signed-off-by: Lijun Pan Link: https://lore.kernel.org/r/20201223204904.12677-1-ljp@linux.ibm.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ibm/ibmvnic.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index bd0281020d51..91ebcde30292 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -2342,8 +2342,7 @@ static void __ibmvnic_reset(struct work_struct *work) set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(60 * HZ); } - } else if (!(rwi->reset_reason == VNIC_RESET_FATAL && - adapter->from_passive_init)) { + } else { rc = do_reset(adapter, rwi, reset_state); } kfree(rwi); From 808e0d8832cc81738f3e8df12dff0688352baf50 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 14 Dec 2020 13:29:32 -0600 Subject: [PATCH 136/547] e1000e: Only run S0ix flows if shutdown succeeded If the shutdown failed, the part will be thawed and running S0ix flows will put it into an undefined state. Reported-by: Alexander Duyck Reviewed-by: Alexander Duyck Signed-off-by: Mario Limonciello Tested-by: Yijun Shen Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/e1000e/netdev.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index 128ab6898070..6588f5d4a2be 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -6970,13 +6970,14 @@ static __maybe_unused int e1000e_pm_suspend(struct device *dev) e1000e_pm_freeze(dev); rc = __e1000_shutdown(pdev, false); - if (rc) + if (rc) { e1000e_pm_thaw(dev); - - /* Introduce S0ix implementation */ - if (hw->mac.type >= e1000_pch_cnp && - !e1000e_check_me(hw->adapter->pdev->device)) - e1000e_s0ix_entry_flow(adapter); + } else { + /* Introduce S0ix implementation */ + if (hw->mac.type >= e1000_pch_cnp && + !e1000e_check_me(hw->adapter->pdev->device)) + e1000e_s0ix_entry_flow(adapter); + } return rc; } From 3cf31b1a9effd859bb3d6ff9f8b5b0d5e6cac952 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 14 Dec 2020 13:29:33 -0600 Subject: [PATCH 137/547] e1000e: bump up timeout to wait when ME un-configures ULP mode Per guidance from Intel ethernet architecture team, it may take up to 1 second for unconfiguring ULP mode. However in practice this seems to be taking up to 2 seconds on some Lenovo machines. Detect scenarios that take more than 1 second but less than 2.5 seconds and emit a warning on resume for those scenarios. Suggested-by: Aaron Ma Suggested-by: Sasha Netfin Suggested-by: Hans de Goede CC: Mark Pearson Fixes: f15bb6dde738cc8fa0 ("e1000e: Add support for S0ix") BugLink: https://bugs.launchpad.net/bugs/1865570 Link: https://patchwork.ozlabs.org/project/intel-wired-lan/patch/20200323191639.48826-1-aaron.ma@canonical.com/ Link: https://lkml.org/lkml/2020/12/13/15 Link: https://lkml.org/lkml/2020/12/14/708 Signed-off-by: Mario Limonciello Reviewed-by: Hans de Goede Tested-by: Yijun Shen Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/e1000e/ich8lan.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c index 9aa6fad8ed47..6fb46682b058 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -1240,6 +1240,9 @@ static s32 e1000_disable_ulp_lpt_lp(struct e1000_hw *hw, bool force) return 0; if (er32(FWSM) & E1000_ICH_FWSM_FW_VALID) { + struct e1000_adapter *adapter = hw->adapter; + bool firmware_bug = false; + if (force) { /* Request ME un-configure ULP mode in the PHY */ mac_reg = er32(H2ME); @@ -1248,16 +1251,24 @@ static s32 e1000_disable_ulp_lpt_lp(struct e1000_hw *hw, bool force) ew32(H2ME, mac_reg); } - /* Poll up to 300msec for ME to clear ULP_CFG_DONE. */ + /* Poll up to 2.5 seconds for ME to clear ULP_CFG_DONE. + * If this takes more than 1 second, show a warning indicating a + * firmware bug + */ while (er32(FWSM) & E1000_FWSM_ULP_CFG_DONE) { - if (i++ == 30) { + if (i++ == 250) { ret_val = -E1000_ERR_PHY; goto out; } + if (i > 100 && !firmware_bug) + firmware_bug = true; usleep_range(10000, 11000); } - e_dbg("ULP_CONFIG_DONE cleared after %dmsec\n", i * 10); + if (firmware_bug) + e_warn("ULP_CONFIG_DONE took %dmsec. This is a firmware bug\n", i * 10); + else + e_dbg("ULP_CONFIG_DONE cleared after %dmsec\n", i * 10); if (force) { mac_reg = er32(H2ME); From 6cecf02e77ab9bf97e9252f9fcb8f0738a6de12c Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 14 Dec 2020 13:29:34 -0600 Subject: [PATCH 138/547] Revert "e1000e: disable s0ix entry and exit flows for ME systems" commit e086ba2fccda ("e1000e: disable s0ix entry and exit flows for ME systems") disabled s0ix flows for systems that have various incarnations of the i219-LM ethernet controller. This changed caused power consumption regressions on the following shipping Dell Comet Lake based laptops: * Latitude 5310 * Latitude 5410 * Latitude 5410 * Latitude 5510 * Precision 3550 * Latitude 5411 * Latitude 5511 * Precision 3551 * Precision 7550 * Precision 7750 This commit was introduced because of some regressions on certain Thinkpad laptops. This comment was potentially caused by an earlier commit 632fbd5eb5b0e ("e1000e: fix S0ix flows for cable connected case"). or it was possibly caused by a system not meeting platform architectural requirements for low power consumption. Other changes made in the driver with extended timeouts are expected to make the driver more impervious to platform firmware behavior. Fixes: e086ba2fccda ("e1000e: disable s0ix entry and exit flows for ME systems") Reviewed-by: Alexander Duyck Signed-off-by: Mario Limonciello Reviewed-by: Hans de Goede Tested-by: Yijun Shen Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/e1000e/netdev.c | 45 +--------------------- 1 file changed, 2 insertions(+), 43 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index 6588f5d4a2be..b9800ba2006c 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -103,45 +103,6 @@ static const struct e1000_reg_info e1000_reg_info_tbl[] = { {0, NULL} }; -struct e1000e_me_supported { - u16 device_id; /* supported device ID */ -}; - -static const struct e1000e_me_supported me_supported[] = { - {E1000_DEV_ID_PCH_LPT_I217_LM}, - {E1000_DEV_ID_PCH_LPTLP_I218_LM}, - {E1000_DEV_ID_PCH_I218_LM2}, - {E1000_DEV_ID_PCH_I218_LM3}, - {E1000_DEV_ID_PCH_SPT_I219_LM}, - {E1000_DEV_ID_PCH_SPT_I219_LM2}, - {E1000_DEV_ID_PCH_LBG_I219_LM3}, - {E1000_DEV_ID_PCH_SPT_I219_LM4}, - {E1000_DEV_ID_PCH_SPT_I219_LM5}, - {E1000_DEV_ID_PCH_CNP_I219_LM6}, - {E1000_DEV_ID_PCH_CNP_I219_LM7}, - {E1000_DEV_ID_PCH_ICP_I219_LM8}, - {E1000_DEV_ID_PCH_ICP_I219_LM9}, - {E1000_DEV_ID_PCH_CMP_I219_LM10}, - {E1000_DEV_ID_PCH_CMP_I219_LM11}, - {E1000_DEV_ID_PCH_CMP_I219_LM12}, - {E1000_DEV_ID_PCH_TGP_I219_LM13}, - {E1000_DEV_ID_PCH_TGP_I219_LM14}, - {E1000_DEV_ID_PCH_TGP_I219_LM15}, - {0} -}; - -static bool e1000e_check_me(u16 device_id) -{ - struct e1000e_me_supported *id; - - for (id = (struct e1000e_me_supported *)me_supported; - id->device_id; id++) - if (device_id == id->device_id) - return true; - - return false; -} - /** * __ew32_prepare - prepare to write to MAC CSR register on certain parts * @hw: pointer to the HW structure @@ -6974,8 +6935,7 @@ static __maybe_unused int e1000e_pm_suspend(struct device *dev) e1000e_pm_thaw(dev); } else { /* Introduce S0ix implementation */ - if (hw->mac.type >= e1000_pch_cnp && - !e1000e_check_me(hw->adapter->pdev->device)) + if (hw->mac.type >= e1000_pch_cnp) e1000e_s0ix_entry_flow(adapter); } @@ -6991,8 +6951,7 @@ static __maybe_unused int e1000e_pm_resume(struct device *dev) int rc; /* Introduce S0ix implementation */ - if (hw->mac.type >= e1000_pch_cnp && - !e1000e_check_me(hw->adapter->pdev->device)) + if (hw->mac.type >= e1000_pch_cnp) e1000e_s0ix_exit_flow(adapter); rc = __e1000_resume(pdev); From 3c98cbf22a96c1b12f48c1b2a4680dfe5cb280f9 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 14 Dec 2020 13:29:35 -0600 Subject: [PATCH 139/547] e1000e: Export S0ix flags to ethtool This flag can be used by an end user to disable S0ix flows on a buggy system or by an OEM for development purposes. If you need this flag to be persisted across reboots, it's suggested to use a udev rule to call adjust it until the kernel could have your configuration in a disallow list. Signed-off-by: Mario Limonciello Reviewed-by: Hans de Goede Tested-by: Yijun Shen Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/e1000e/e1000.h | 1 + drivers/net/ethernet/intel/e1000e/ethtool.c | 46 +++++++++++++++++++++ drivers/net/ethernet/intel/e1000e/netdev.c | 9 ++-- 3 files changed, 52 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h b/drivers/net/ethernet/intel/e1000e/e1000.h index ba7a0f8f6937..5b2143f4b1f8 100644 --- a/drivers/net/ethernet/intel/e1000e/e1000.h +++ b/drivers/net/ethernet/intel/e1000e/e1000.h @@ -436,6 +436,7 @@ s32 e1000e_get_base_timinca(struct e1000_adapter *adapter, u32 *timinca); #define FLAG2_DFLT_CRC_STRIPPING BIT(12) #define FLAG2_CHECK_RX_HWTSTAMP BIT(13) #define FLAG2_CHECK_SYSTIM_OVERFLOW BIT(14) +#define FLAG2_ENABLE_S0IX_FLOWS BIT(15) #define E1000_RX_DESC_PS(R, i) \ (&(((union e1000_rx_desc_packet_split *)((R).desc))[i])) diff --git a/drivers/net/ethernet/intel/e1000e/ethtool.c b/drivers/net/ethernet/intel/e1000e/ethtool.c index 03215b0aee4b..06442e6bef73 100644 --- a/drivers/net/ethernet/intel/e1000e/ethtool.c +++ b/drivers/net/ethernet/intel/e1000e/ethtool.c @@ -23,6 +23,13 @@ struct e1000_stats { int stat_offset; }; +static const char e1000e_priv_flags_strings[][ETH_GSTRING_LEN] = { +#define E1000E_PRIV_FLAGS_S0IX_ENABLED BIT(0) + "s0ix-enabled", +}; + +#define E1000E_PRIV_FLAGS_STR_LEN ARRAY_SIZE(e1000e_priv_flags_strings) + #define E1000_STAT(str, m) { \ .stat_string = str, \ .type = E1000_STATS, \ @@ -1776,6 +1783,8 @@ static int e1000e_get_sset_count(struct net_device __always_unused *netdev, return E1000_TEST_LEN; case ETH_SS_STATS: return E1000_STATS_LEN; + case ETH_SS_PRIV_FLAGS: + return E1000E_PRIV_FLAGS_STR_LEN; default: return -EOPNOTSUPP; } @@ -2097,6 +2106,10 @@ static void e1000_get_strings(struct net_device __always_unused *netdev, p += ETH_GSTRING_LEN; } break; + case ETH_SS_PRIV_FLAGS: + memcpy(data, e1000e_priv_flags_strings, + E1000E_PRIV_FLAGS_STR_LEN * ETH_GSTRING_LEN); + break; } } @@ -2305,6 +2318,37 @@ static int e1000e_get_ts_info(struct net_device *netdev, return 0; } +static u32 e1000e_get_priv_flags(struct net_device *netdev) +{ + struct e1000_adapter *adapter = netdev_priv(netdev); + u32 priv_flags = 0; + + if (adapter->flags2 & FLAG2_ENABLE_S0IX_FLOWS) + priv_flags |= E1000E_PRIV_FLAGS_S0IX_ENABLED; + + return priv_flags; +} + +static int e1000e_set_priv_flags(struct net_device *netdev, u32 priv_flags) +{ + struct e1000_adapter *adapter = netdev_priv(netdev); + unsigned int flags2 = adapter->flags2; + + flags2 &= ~FLAG2_ENABLE_S0IX_FLOWS; + if (priv_flags & E1000E_PRIV_FLAGS_S0IX_ENABLED) { + struct e1000_hw *hw = &adapter->hw; + + if (hw->mac.type < e1000_pch_cnp) + return -EINVAL; + flags2 |= FLAG2_ENABLE_S0IX_FLOWS; + } + + if (flags2 != adapter->flags2) + adapter->flags2 = flags2; + + return 0; +} + static const struct ethtool_ops e1000_ethtool_ops = { .supported_coalesce_params = ETHTOOL_COALESCE_RX_USECS, .get_drvinfo = e1000_get_drvinfo, @@ -2336,6 +2380,8 @@ static const struct ethtool_ops e1000_ethtool_ops = { .set_eee = e1000e_set_eee, .get_link_ksettings = e1000_get_link_ksettings, .set_link_ksettings = e1000_set_link_ksettings, + .get_priv_flags = e1000e_get_priv_flags, + .set_priv_flags = e1000e_set_priv_flags, }; void e1000e_set_ethtool_ops(struct net_device *netdev) diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index b9800ba2006c..e9b82c209c2d 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -6923,7 +6923,6 @@ static __maybe_unused int e1000e_pm_suspend(struct device *dev) struct net_device *netdev = pci_get_drvdata(to_pci_dev(dev)); struct e1000_adapter *adapter = netdev_priv(netdev); struct pci_dev *pdev = to_pci_dev(dev); - struct e1000_hw *hw = &adapter->hw; int rc; e1000e_flush_lpic(pdev); @@ -6935,7 +6934,7 @@ static __maybe_unused int e1000e_pm_suspend(struct device *dev) e1000e_pm_thaw(dev); } else { /* Introduce S0ix implementation */ - if (hw->mac.type >= e1000_pch_cnp) + if (adapter->flags2 & FLAG2_ENABLE_S0IX_FLOWS) e1000e_s0ix_entry_flow(adapter); } @@ -6947,11 +6946,10 @@ static __maybe_unused int e1000e_pm_resume(struct device *dev) struct net_device *netdev = pci_get_drvdata(to_pci_dev(dev)); struct e1000_adapter *adapter = netdev_priv(netdev); struct pci_dev *pdev = to_pci_dev(dev); - struct e1000_hw *hw = &adapter->hw; int rc; /* Introduce S0ix implementation */ - if (hw->mac.type >= e1000_pch_cnp) + if (adapter->flags2 & FLAG2_ENABLE_S0IX_FLOWS) e1000e_s0ix_exit_flow(adapter); rc = __e1000_resume(pdev); @@ -7615,6 +7613,9 @@ static int e1000_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (!(adapter->flags & FLAG_HAS_AMT)) e1000e_get_hw_control(adapter); + if (hw->mac.type >= e1000_pch_cnp) + adapter->flags2 |= FLAG2_ENABLE_S0IX_FLOWS; + strlcpy(netdev->name, "eth%d", sizeof(netdev->name)); err = register_netdev(netdev); if (err) From 11b844b0b7c7c3dc8e8f4d0bbaad5e798351862c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 23 Dec 2020 12:06:52 -0800 Subject: [PATCH 140/547] selftests/bpf: Work-around EBUSY errors from hashmap update/delete 20b6cc34ea74 ("bpf: Avoid hashtab deadlock with map_locked") introduced a possibility of getting EBUSY error on lock contention, which seems to happen very deterministically in test_maps when running 1024 threads on low-CPU machine. In libbpf CI case, it's a 2 CPU VM and it's hitting this 100% of the time. Work around by retrying on EBUSY (and EAGAIN, while we are at it) after a small sleep. sched_yield() is too agressive and fails even after 20 retries, so I went with usleep(1) for backoff. Also log actual error returned to make it easier to see what's going on. Fixes: 20b6cc34ea74 ("bpf: Avoid hashtab deadlock with map_locked") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201223200652.3417075-1-andrii@kernel.org --- tools/testing/selftests/bpf/test_maps.c | 48 +++++++++++++++++++++---- 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 0ad3e6305ff0..51adc42b2b40 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -1312,22 +1312,58 @@ static void test_map_stress(void) #define DO_UPDATE 1 #define DO_DELETE 0 +#define MAP_RETRIES 20 + +static int map_update_retriable(int map_fd, const void *key, const void *value, + int flags, int attempts) +{ + while (bpf_map_update_elem(map_fd, key, value, flags)) { + if (!attempts || (errno != EAGAIN && errno != EBUSY)) + return -errno; + + usleep(1); + attempts--; + } + + return 0; +} + +static int map_delete_retriable(int map_fd, const void *key, int attempts) +{ + while (bpf_map_delete_elem(map_fd, key)) { + if (!attempts || (errno != EAGAIN && errno != EBUSY)) + return -errno; + + usleep(1); + attempts--; + } + + return 0; +} + static void test_update_delete(unsigned int fn, void *data) { int do_update = ((int *)data)[1]; int fd = ((int *)data)[0]; - int i, key, value; + int i, key, value, err; for (i = fn; i < MAP_SIZE; i += TASKS) { key = value = i; if (do_update) { - assert(bpf_map_update_elem(fd, &key, &value, - BPF_NOEXIST) == 0); - assert(bpf_map_update_elem(fd, &key, &value, - BPF_EXIST) == 0); + err = map_update_retriable(fd, &key, &value, BPF_NOEXIST, MAP_RETRIES); + if (err) + printf("error %d %d\n", err, errno); + assert(err == 0); + err = map_update_retriable(fd, &key, &value, BPF_EXIST, MAP_RETRIES); + if (err) + printf("error %d %d\n", err, errno); + assert(err == 0); } else { - assert(bpf_map_delete_elem(fd, &key) == 0); + err = map_delete_retriable(fd, &key, MAP_RETRIES); + if (err) + printf("error %d %d\n", err, errno); + assert(err == 0); } } } From 69ca310f34168eae0ada434796bfc22fb4a0fa26 Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Fri, 18 Dec 2020 10:50:30 -0800 Subject: [PATCH 141/547] bpf: Save correct stopping point in file seq iteration On some systems, some variant of the following splat is repeatedly seen. The common factor in all traces seems to be the entry point to task_file_seq_next(). With the patch, all warnings go away. rcu: INFO: rcu_sched self-detected stall on CPU rcu: \x0926-....: (20992 ticks this GP) idle=d7e/1/0x4000000000000002 softirq=81556231/81556231 fqs=4876 \x09(t=21033 jiffies g=159148529 q=223125) NMI backtrace for cpu 26 CPU: 26 PID: 2015853 Comm: bpftool Kdump: loaded Not tainted 5.6.13-0_fbk4_3876_gd8d1f9bf80bb #1 Hardware name: Quanta Twin Lakes MP/Twin Lakes Passive MP, BIOS F09_3A12 10/08/2018 Call Trace: dump_stack+0x50/0x70 nmi_cpu_backtrace.cold.6+0x13/0x50 ? lapic_can_unplug_cpu.cold.30+0x40/0x40 nmi_trigger_cpumask_backtrace+0xba/0xca rcu_dump_cpu_stacks+0x99/0xc7 rcu_sched_clock_irq.cold.90+0x1b4/0x3aa ? tick_sched_do_timer+0x60/0x60 update_process_times+0x24/0x50 tick_sched_timer+0x37/0x70 __hrtimer_run_queues+0xfe/0x270 hrtimer_interrupt+0xf4/0x210 smp_apic_timer_interrupt+0x5e/0x120 apic_timer_interrupt+0xf/0x20 RIP: 0010:get_pid_task+0x38/0x80 Code: 89 f6 48 8d 44 f7 08 48 8b 00 48 85 c0 74 2b 48 83 c6 55 48 c1 e6 04 48 29 f0 74 19 48 8d 78 20 ba 01 00 00 00 f0 0f c1 50 20 <85> d2 74 27 78 11 83 c2 01 78 0c 48 83 c4 08 c3 31 c0 48 83 c4 08 RSP: 0018:ffffc9000d293dc8 EFLAGS: 00000202 ORIG_RAX: ffffffffffffff13 RAX: ffff888637c05600 RBX: ffffc9000d293e0c RCX: 0000000000000000 RDX: 0000000000000001 RSI: 0000000000000550 RDI: ffff888637c05620 RBP: ffffffff8284eb80 R08: ffff88831341d300 R09: ffff88822ffd8248 R10: ffff88822ffd82d0 R11: 00000000003a93c0 R12: 0000000000000001 R13: 00000000ffffffff R14: ffff88831341d300 R15: 0000000000000000 ? find_ge_pid+0x1b/0x20 task_seq_get_next+0x52/0xc0 task_file_seq_get_next+0x159/0x220 task_file_seq_next+0x4f/0xa0 bpf_seq_read+0x159/0x390 vfs_read+0x8a/0x140 ksys_read+0x59/0xd0 do_syscall_64+0x42/0x110 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f95ae73e76e Code: Bad RIP value. RSP: 002b:00007ffc02c1dbf8 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 000000000170faa0 RCX: 00007f95ae73e76e RDX: 0000000000001000 RSI: 00007ffc02c1dc30 RDI: 0000000000000007 RBP: 00007ffc02c1ec70 R08: 0000000000000005 R09: 0000000000000006 R10: fffffffffffff20b R11: 0000000000000246 R12: 00000000019112a0 R13: 0000000000000000 R14: 0000000000000007 R15: 00000000004283c0 If unable to obtain the file structure for the current task, proceed to the next task number after the one returned from task_seq_get_next(), instead of the next task number from the original iterator. Also, save the stopping task number from task_seq_get_next() on failure in case of restarts. Fixes: eaaacd23910f ("bpf: Add task and task/file iterator targets") Signed-off-by: Jonathan Lemon Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201218185032.2464558-2-jonathan.lemon@gmail.com --- kernel/bpf/task_iter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 0458a40edf10..8033ab19138a 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -158,13 +158,14 @@ again: if (!curr_task) { info->task = NULL; info->files = NULL; + info->tid = curr_tid; return NULL; } curr_files = get_files_struct(curr_task); if (!curr_files) { put_task_struct(curr_task); - curr_tid = ++(info->tid); + curr_tid = curr_tid + 1; info->fd = 0; goto again; } From a61daaf351da7c8493f2586437617d60c24350b0 Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Fri, 18 Dec 2020 10:50:31 -0800 Subject: [PATCH 142/547] bpf: Use thread_group_leader() Instead of directly comparing task->tgid and task->pid, use the thread_group_leader() helper. This helps with readability, and there should be no functional change. Signed-off-by: Jonathan Lemon Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201218185032.2464558-3-jonathan.lemon@gmail.com --- kernel/bpf/task_iter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 8033ab19138a..dc4007f1843b 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -37,7 +37,7 @@ retry: if (!task) { ++*tid; goto retry; - } else if (skip_if_dup_files && task->tgid != task->pid && + } else if (skip_if_dup_files && !thread_group_leader(task) && task->files == task->group_leader->files) { put_task_struct(task); task = NULL; From e13a6915a03ffc3ce332d28c141a335e25187fa3 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Wed, 23 Dec 2020 15:36:38 +0100 Subject: [PATCH 143/547] vhost/vsock: add IOTLB API support This patch enables the IOTLB API support for vhost-vsock devices, allowing the userspace to emulate an IOMMU for the guest. These changes were made following vhost-net, in details this patch: - exposes VIRTIO_F_ACCESS_PLATFORM feature and inits the iotlb device if the feature is acked - implements VHOST_GET_BACKEND_FEATURES and VHOST_SET_BACKEND_FEATURES ioctls - calls vq_meta_prefetch() before vq processing to prefetch vq metadata address in IOTLB - provides .read_iter, .write_iter, and .poll callbacks for the chardev; they are used by the userspace to exchange IOTLB messages This patch was tested specifying "intel_iommu=strict" in the guest kernel command line. I used QEMU with a patch applied [1] to fix a simple issue (that patch was merged in QEMU v5.2.0): $ qemu -M q35,accel=kvm,kernel-irqchip=split \ -drive file=fedora.qcow2,format=qcow2,if=virtio \ -device intel-iommu,intremap=on,device-iotlb=on \ -device vhost-vsock-pci,guest-cid=3,iommu_platform=on,ats=on [1] https://lists.gnu.org/archive/html/qemu-devel/2020-10/msg09077.html Reviewed-by: Stefan Hajnoczi Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20201223143638.123417-1-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vhost/vsock.c | 68 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 3 deletions(-) diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index a483cec31d5c..5e78fb719602 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -30,7 +30,12 @@ #define VHOST_VSOCK_PKT_WEIGHT 256 enum { - VHOST_VSOCK_FEATURES = VHOST_FEATURES, + VHOST_VSOCK_FEATURES = VHOST_FEATURES | + (1ULL << VIRTIO_F_ACCESS_PLATFORM) +}; + +enum { + VHOST_VSOCK_BACKEND_FEATURES = (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2) }; /* Used to track all the vhost_vsock instances on the system. */ @@ -94,6 +99,9 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock, if (!vhost_vq_get_backend(vq)) goto out; + if (!vq_meta_prefetch(vq)) + goto out; + /* Avoid further vmexits, we're already processing the virtqueue */ vhost_disable_notify(&vsock->dev, vq); @@ -449,6 +457,9 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work) if (!vhost_vq_get_backend(vq)) goto out; + if (!vq_meta_prefetch(vq)) + goto out; + vhost_disable_notify(&vsock->dev, vq); do { u32 len; @@ -766,8 +777,12 @@ static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features) mutex_lock(&vsock->dev.mutex); if ((features & (1 << VHOST_F_LOG_ALL)) && !vhost_log_access_ok(&vsock->dev)) { - mutex_unlock(&vsock->dev.mutex); - return -EFAULT; + goto err; + } + + if ((features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) { + if (vhost_init_device_iotlb(&vsock->dev, true)) + goto err; } for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) { @@ -778,6 +793,10 @@ static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features) } mutex_unlock(&vsock->dev.mutex); return 0; + +err: + mutex_unlock(&vsock->dev.mutex); + return -EFAULT; } static long vhost_vsock_dev_ioctl(struct file *f, unsigned int ioctl, @@ -811,6 +830,18 @@ static long vhost_vsock_dev_ioctl(struct file *f, unsigned int ioctl, if (copy_from_user(&features, argp, sizeof(features))) return -EFAULT; return vhost_vsock_set_features(vsock, features); + case VHOST_GET_BACKEND_FEATURES: + features = VHOST_VSOCK_BACKEND_FEATURES; + if (copy_to_user(argp, &features, sizeof(features))) + return -EFAULT; + return 0; + case VHOST_SET_BACKEND_FEATURES: + if (copy_from_user(&features, argp, sizeof(features))) + return -EFAULT; + if (features & ~VHOST_VSOCK_BACKEND_FEATURES) + return -EOPNOTSUPP; + vhost_set_backend_features(&vsock->dev, features); + return 0; default: mutex_lock(&vsock->dev.mutex); r = vhost_dev_ioctl(&vsock->dev, ioctl, argp); @@ -823,6 +854,34 @@ static long vhost_vsock_dev_ioctl(struct file *f, unsigned int ioctl, } } +static ssize_t vhost_vsock_chr_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + struct vhost_vsock *vsock = file->private_data; + struct vhost_dev *dev = &vsock->dev; + int noblock = file->f_flags & O_NONBLOCK; + + return vhost_chr_read_iter(dev, to, noblock); +} + +static ssize_t vhost_vsock_chr_write_iter(struct kiocb *iocb, + struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct vhost_vsock *vsock = file->private_data; + struct vhost_dev *dev = &vsock->dev; + + return vhost_chr_write_iter(dev, from); +} + +static __poll_t vhost_vsock_chr_poll(struct file *file, poll_table *wait) +{ + struct vhost_vsock *vsock = file->private_data; + struct vhost_dev *dev = &vsock->dev; + + return vhost_chr_poll(file, dev, wait); +} + static const struct file_operations vhost_vsock_fops = { .owner = THIS_MODULE, .open = vhost_vsock_dev_open, @@ -830,6 +889,9 @@ static const struct file_operations vhost_vsock_fops = { .llseek = noop_llseek, .unlocked_ioctl = vhost_vsock_dev_ioctl, .compat_ioctl = compat_ptr_ioctl, + .read_iter = vhost_vsock_chr_read_iter, + .write_iter = vhost_vsock_chr_write_iter, + .poll = vhost_vsock_chr_poll, }; static struct miscdevice vhost_vsock_misc = { From 6cb56218ad9e580e519dcd23bfb3db08d8692e5a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 22 Dec 2020 23:23:56 +0100 Subject: [PATCH 144/547] netfilter: xt_RATEEST: reject non-null terminated string from userspace syzbot reports: detected buffer overflow in strlen [..] Call Trace: strlen include/linux/string.h:325 [inline] strlcpy include/linux/string.h:348 [inline] xt_rateest_tg_checkentry+0x2a5/0x6b0 net/netfilter/xt_RATEEST.c:143 strlcpy assumes src is a c-string. Check info->name before its used. Reported-by: syzbot+e86f7c428c8c50db65b4@syzkaller.appspotmail.com Fixes: 5859034d7eb8793 ("[NETFILTER]: x_tables: add RATEEST target") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_RATEEST.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index 37253d399c6b..0d5c422f8745 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -115,6 +115,9 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) } cfg; int ret; + if (strnlen(info->name, sizeof(est->name)) >= sizeof(est->name)) + return -ENAMETOOLONG; + net_get_random_once(&jhash_rnd, sizeof(jhash_rnd)); mutex_lock(&xn->hash_lock); From 9e5c23b9bd71d00b07720b2a8037b019d356e9df Mon Sep 17 00:00:00 2001 From: Alexandru Elisei Date: Tue, 1 Dec 2020 15:01:56 +0000 Subject: [PATCH 145/547] KVM: arm64: Update comment in kvm_vgic_map_resources() vgic_v3_map_resources() returns -EBUSY if the VGIC isn't initialized, update the comment to kvm_vgic_map_resources() to match what the function does. Signed-off-by: Alexandru Elisei Reviewed-by: Eric Auger Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201201150157.223625-5-alexandru.elisei@arm.com --- arch/arm64/kvm/vgic/vgic-init.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index a2f4d1c85f00..5b54787a9ad5 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -419,7 +419,8 @@ int vgic_lazy_init(struct kvm *kvm) * Map the MMIO regions depending on the VGIC model exposed to the guest * called on the first VCPU run. * Also map the virtual CPU interface into the VM. - * v2/v3 derivatives call vgic_init if not already done. + * v2 calls vgic_init() if not already done. + * v3 and derivatives return an error if the VGIC is not initialized. * vgic_ready() returns true if this function has succeeded. * @kvm: kvm struct pointer */ From 282ff80135717cc43f1e33ddd4b0cd9e760d060b Mon Sep 17 00:00:00 2001 From: Alexandru Elisei Date: Tue, 1 Dec 2020 15:01:57 +0000 Subject: [PATCH 146/547] KVM: arm64: Remove redundant call to kvm_pmu_vcpu_reset() KVM_ARM_VCPU_INIT ioctl calls kvm_reset_vcpu(), which in turn resets the PMU with a call to kvm_pmu_vcpu_reset(). The function zeroes the PMU chained counters bitmap and stops all the counters with a perf event attached. Because it is called before the VCPU has had the chance to run, no perf events are in use and none are released. kvm_arm_pmu_v3_enable(), called by kvm_vcpu_first_run_init() only if the VCPU has been initialized, also resets the PMU. kvm_pmu_vcpu_reset() in this case does the exact same thing as the previous call, so remove it. Signed-off-by: Alexandru Elisei Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201201150157.223625-6-alexandru.elisei@arm.com --- arch/arm64/kvm/pmu-emul.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 398f6df1bbe4..4ad66a532e38 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -850,8 +850,6 @@ int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu) return -EINVAL; } - kvm_pmu_vcpu_reset(vcpu); - return 0; } From 101068b566ef227b605d807aad9e72efd8b6bc5b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 27 Dec 2020 14:28:34 +0000 Subject: [PATCH 147/547] KVM: arm64: Consolidate dist->ready setting into kvm_vgic_map_resources() dist->ready setting is pointlessly spread across the two vgic backends, while it could be consolidated in kvm_vgic_map_resources(). Move it there, and slightly simplify the flows in both backends. Suggested-by: Eric Auger Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-init.c | 2 ++ arch/arm64/kvm/vgic/vgic-v2.c | 17 ++++++----------- arch/arm64/kvm/vgic/vgic-v3.c | 18 ++++++------------ 3 files changed, 14 insertions(+), 23 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 5b54787a9ad5..052917deb149 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -446,6 +446,8 @@ int kvm_vgic_map_resources(struct kvm *kvm) if (ret) __kvm_vgic_destroy(kvm); + else + dist->ready = true; out: mutex_unlock(&kvm->lock); diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index 7f38c1a93639..11934c2af2f4 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -309,14 +309,12 @@ int vgic_v2_map_resources(struct kvm *kvm) if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) || IS_VGIC_ADDR_UNDEF(dist->vgic_cpu_base)) { kvm_err("Need to set vgic cpu and dist addresses first\n"); - ret = -ENXIO; - goto out; + return -ENXIO; } if (!vgic_v2_check_base(dist->vgic_dist_base, dist->vgic_cpu_base)) { kvm_err("VGIC CPU and dist frames overlap\n"); - ret = -EINVAL; - goto out; + return -EINVAL; } /* @@ -326,13 +324,13 @@ int vgic_v2_map_resources(struct kvm *kvm) ret = vgic_init(kvm); if (ret) { kvm_err("Unable to initialize VGIC dynamic data structures\n"); - goto out; + return ret; } ret = vgic_register_dist_iodev(kvm, dist->vgic_dist_base, VGIC_V2); if (ret) { kvm_err("Unable to register VGIC MMIO regions\n"); - goto out; + return ret; } if (!static_branch_unlikely(&vgic_v2_cpuif_trap)) { @@ -341,14 +339,11 @@ int vgic_v2_map_resources(struct kvm *kvm) KVM_VGIC_V2_CPU_SIZE, true); if (ret) { kvm_err("Unable to remap VGIC CPU to VCPU\n"); - goto out; + return ret; } } - dist->ready = true; - -out: - return ret; + return 0; } DEFINE_STATIC_KEY_FALSE(vgic_v2_cpuif_trap); diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 35029c5cb0f1..52915b342351 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -505,21 +505,18 @@ int vgic_v3_map_resources(struct kvm *kvm) if (IS_VGIC_ADDR_UNDEF(vgic_cpu->rd_iodev.base_addr)) { kvm_debug("vcpu %d redistributor base not set\n", c); - ret = -ENXIO; - goto out; + return -ENXIO; } } if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base)) { kvm_err("Need to set vgic distributor addresses first\n"); - ret = -ENXIO; - goto out; + return -ENXIO; } if (!vgic_v3_check_base(kvm)) { kvm_err("VGIC redist and dist frames overlap\n"); - ret = -EINVAL; - goto out; + return -EINVAL; } /* @@ -527,22 +524,19 @@ int vgic_v3_map_resources(struct kvm *kvm) * the VGIC before we need to use it. */ if (!vgic_initialized(kvm)) { - ret = -EBUSY; - goto out; + return -EBUSY; } ret = vgic_register_dist_iodev(kvm, dist->vgic_dist_base, VGIC_V3); if (ret) { kvm_err("Unable to register VGICv3 dist MMIO regions\n"); - goto out; + return ret; } if (kvm_vgic_global_state.has_gicv4_1) vgic_v4_configure_vsgis(kvm); - dist->ready = true; -out: - return ret; + return 0; } DEFINE_STATIC_KEY_FALSE(vgic_v3_cpuif_trap); From 976509bb310b913d30577f15b58bdd30effb0542 Mon Sep 17 00:00:00 2001 From: Quanyang Wang Date: Thu, 24 Dec 2020 18:49:27 +0800 Subject: [PATCH 148/547] opp: fix memory leak in _allocate_opp_table In function _allocate_opp_table, opp_dev is allocated and referenced by opp_table via _add_opp_dev. But in the case that the subsequent calls return -EPROBE_DEFER, it will jump to err label and opp_table will be freed. Then opp_dev becomes an unreferenced object to cause memory leak. So let's call _remove_opp_dev to do the cleanup. This fixes the following kmemleak report: unreferenced object 0xffff000801524a00 (size 128): comm "swapper/0", pid 1, jiffies 4294892465 (age 84.616s) hex dump (first 32 bytes): 40 00 56 01 08 00 ff ff 40 00 56 01 08 00 ff ff @.V.....@.V..... b8 52 77 7f 08 00 ff ff 00 3c 4c 00 08 00 ff ff .Rw......] kmemleak_alloc+0x30/0x40 [<0000000056da48f0>] kmem_cache_alloc+0x3d4/0x588 [<00000000a84b3b0e>] _add_opp_dev+0x2c/0x88 [<0000000062a380cd>] _add_opp_table_indexed+0x124/0x268 [<000000008b4c8f1f>] dev_pm_opp_of_add_table+0x20/0x1d8 [<00000000e5316798>] dev_pm_opp_of_cpumask_add_table+0x48/0xf0 [<00000000db0a8ec2>] dt_cpufreq_probe+0x20c/0x448 [<0000000030a3a26c>] platform_probe+0x68/0xd8 [<00000000c618e78d>] really_probe+0xd0/0x3a0 [<00000000642e856f>] driver_probe_device+0x58/0xb8 [<00000000f10f5307>] device_driver_attach+0x74/0x80 [<0000000004f254b8>] __driver_attach+0x58/0xe0 [<0000000009d5d19e>] bus_for_each_dev+0x70/0xc8 [<0000000000d22e1c>] driver_attach+0x24/0x30 [<0000000001d4e952>] bus_add_driver+0x14c/0x1f0 [<0000000089928aaa>] driver_register+0x64/0x120 Cc: v5.10 # v5.10 Fixes: dd461cd9183f ("opp: Allow dev_pm_opp_get_opp_table() to return -EPROBE_DEFER") Signed-off-by: Quanyang Wang [ Viresh: Added the stable tag ] Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index 4268eb359915..c9e50836b4c2 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -1092,7 +1092,7 @@ static struct opp_table *_allocate_opp_table(struct device *dev, int index) if (IS_ERR(opp_table->clk)) { ret = PTR_ERR(opp_table->clk); if (ret == -EPROBE_DEFER) - goto err; + goto remove_opp_dev; dev_dbg(dev, "%s: Couldn't find clock: %d\n", __func__, ret); } @@ -1101,7 +1101,7 @@ static struct opp_table *_allocate_opp_table(struct device *dev, int index) ret = dev_pm_opp_of_find_icc_paths(dev, opp_table); if (ret) { if (ret == -EPROBE_DEFER) - goto err; + goto remove_opp_dev; dev_warn(dev, "%s: Error finding interconnect paths: %d\n", __func__, ret); @@ -1113,6 +1113,8 @@ static struct opp_table *_allocate_opp_table(struct device *dev, int index) return opp_table; +remove_opp_dev: + _remove_opp_dev(opp_dev, opp_table); err: kfree(opp_table); return ERR_PTR(ret); From 0e1d9ca1766f5d95fb881f57b6c4a1ffa63d4648 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 28 Dec 2020 10:51:04 +0530 Subject: [PATCH 149/547] opp: Call the missing clk_put() on error Fix the clock reference counting by calling the missing clk_put() in the error path. Cc: v5.10 # v5.10 Fixes: dd461cd9183f ("opp: Allow dev_pm_opp_get_opp_table() to return -EPROBE_DEFER") Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index c9e50836b4c2..8c905aabacc0 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -1101,7 +1101,7 @@ static struct opp_table *_allocate_opp_table(struct device *dev, int index) ret = dev_pm_opp_of_find_icc_paths(dev, opp_table); if (ret) { if (ret == -EPROBE_DEFER) - goto remove_opp_dev; + goto put_clk; dev_warn(dev, "%s: Error finding interconnect paths: %d\n", __func__, ret); @@ -1113,6 +1113,9 @@ static struct opp_table *_allocate_opp_table(struct device *dev, int index) return opp_table; +put_clk: + if (!IS_ERR(opp_table->clk)) + clk_put(opp_table->clk); remove_opp_dev: _remove_opp_dev(opp_dev, opp_table); err: From 105b5ca9b1e38a8db8446a493ca062eea98171eb Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Mon, 30 Nov 2020 16:36:30 +0200 Subject: [PATCH 150/547] habanalabs: Fix a missing-braces warning Fix a compilation "missing braces around initializer" warning. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/habanalabs_ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c index 32e6af1db4e3..a9927a06c5ea 100644 --- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c @@ -406,7 +406,7 @@ static int total_energy_consumption_info(struct hl_fpriv *hpriv, static int pll_frequency_info(struct hl_fpriv *hpriv, struct hl_info_args *args) { struct hl_device *hdev = hpriv->hdev; - struct hl_pll_frequency_info freq_info = {0}; + struct hl_pll_frequency_info freq_info = {}; u32 max_size = args->return_size; void __user *out = (void __user *) (uintptr_t) args->return_pointer; int rc; From 429f1571e8f0b14ec42b8fb14efcfc0576b2788f Mon Sep 17 00:00:00 2001 From: Alon Mizrahi Date: Tue, 1 Dec 2020 18:44:11 +0200 Subject: [PATCH 151/547] habanalabs: add comment for pll frequency ioctl opcode Forgot to add the comment for the opcode when it was added. Signed-off-by: Alon Mizrahi Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- include/uapi/misc/habanalabs.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index 8c15a7d336a0..dc8bcec195cc 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -279,6 +279,7 @@ enum hl_device_status { * HL_INFO_CLK_THROTTLE_REASON - Retrieve clock throttling reason * HL_INFO_SYNC_MANAGER - Retrieve sync manager info per dcore * HL_INFO_TOTAL_ENERGY - Retrieve total energy consumption + * HL_INFO_PLL_FREQUENCY - Retrieve PLL frequency */ #define HL_INFO_HW_IP_INFO 0 #define HL_INFO_HW_EVENTS 1 From 4783489951b78525a6e61b43936cbbd88b7938af Mon Sep 17 00:00:00 2001 From: Alon Mizrahi Date: Thu, 26 Nov 2020 13:05:20 +0200 Subject: [PATCH 152/547] habanalabs: fetch PSOC PLL frequency from F/W in goya When the F/W security is enabled, goya needs to fetch the PSOC pll frequency through a dedicated interface Signed-off-by: Alon Mizrahi Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/goya/goya.c | 59 ++++++++++++++++++----------- 1 file changed, 37 insertions(+), 22 deletions(-) diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 3e5eb9e3d7bd..b66fd55accb5 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -694,32 +694,47 @@ static void goya_qman0_set_security(struct hl_device *hdev, bool secure) static void goya_fetch_psoc_frequency(struct hl_device *hdev) { struct asic_fixed_properties *prop = &hdev->asic_prop; - u32 trace_freq = 0; - u32 pll_clk = 0; - u32 div_fctr = RREG32(mmPSOC_PCI_PLL_DIV_FACTOR_1); - u32 div_sel = RREG32(mmPSOC_PCI_PLL_DIV_SEL_1); - u32 nr = RREG32(mmPSOC_PCI_PLL_NR); - u32 nf = RREG32(mmPSOC_PCI_PLL_NF); - u32 od = RREG32(mmPSOC_PCI_PLL_OD); + u32 nr = 0, nf = 0, od = 0, div_fctr = 0, pll_clk, div_sel; + u16 pll_freq_arr[HL_PLL_NUM_OUTPUTS], freq; + int rc; - if (div_sel == DIV_SEL_REF_CLK || div_sel == DIV_SEL_DIVIDED_REF) { - if (div_sel == DIV_SEL_REF_CLK) - trace_freq = PLL_REF_CLK; - else - trace_freq = PLL_REF_CLK / (div_fctr + 1); - } else if (div_sel == DIV_SEL_PLL_CLK || - div_sel == DIV_SEL_DIVIDED_PLL) { - pll_clk = PLL_REF_CLK * (nf + 1) / ((nr + 1) * (od + 1)); - if (div_sel == DIV_SEL_PLL_CLK) - trace_freq = pll_clk; - else - trace_freq = pll_clk / (div_fctr + 1); + if (hdev->asic_prop.fw_security_disabled) { + div_fctr = RREG32(mmPSOC_PCI_PLL_DIV_FACTOR_1); + div_sel = RREG32(mmPSOC_PCI_PLL_DIV_SEL_1); + nr = RREG32(mmPSOC_PCI_PLL_NR); + nf = RREG32(mmPSOC_PCI_PLL_NF); + od = RREG32(mmPSOC_PCI_PLL_OD); + + if (div_sel == DIV_SEL_REF_CLK || + div_sel == DIV_SEL_DIVIDED_REF) { + if (div_sel == DIV_SEL_REF_CLK) + freq = PLL_REF_CLK; + else + freq = PLL_REF_CLK / (div_fctr + 1); + } else if (div_sel == DIV_SEL_PLL_CLK || + div_sel == DIV_SEL_DIVIDED_PLL) { + pll_clk = PLL_REF_CLK * (nf + 1) / + ((nr + 1) * (od + 1)); + if (div_sel == DIV_SEL_PLL_CLK) + freq = pll_clk; + else + freq = pll_clk / (div_fctr + 1); + } else { + dev_warn(hdev->dev, + "Received invalid div select value: %d", + div_sel); + freq = 0; + } } else { - dev_warn(hdev->dev, - "Received invalid div select value: %d", div_sel); + rc = hl_fw_cpucp_pll_info_get(hdev, PCI_PLL, pll_freq_arr); + + if (rc) + return; + + freq = pll_freq_arr[1]; } - prop->psoc_timestamp_frequency = trace_freq; + prop->psoc_timestamp_frequency = freq; prop->psoc_pci_pll_nr = nr; prop->psoc_pci_pll_nf = nf; prop->psoc_pci_pll_od = od; From 6585489e808d9964dbde9dad89ac8e792e1185fc Mon Sep 17 00:00:00 2001 From: Alon Mizrahi Date: Thu, 19 Nov 2020 16:34:19 +0200 Subject: [PATCH 153/547] habanalabs: remove generic gaudi get_pll_freq function As we only fetch the CPU_PLL frequency in gaudi, we don't need a generic get_pll_frequency function which takes a pll index as input Signed-off-by: Alon Mizrahi Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- .../misc/habanalabs/common/habanalabs_ioctl.c | 2 +- drivers/misc/habanalabs/gaudi/gaudi.c | 132 ++++++------------ drivers/misc/habanalabs/gaudi/gaudiP.h | 7 - 3 files changed, 44 insertions(+), 97 deletions(-) diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c index a9927a06c5ea..a0c0d20f6f8f 100644 --- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c @@ -406,7 +406,7 @@ static int total_energy_consumption_info(struct hl_fpriv *hpriv, static int pll_frequency_info(struct hl_fpriv *hpriv, struct hl_info_args *args) { struct hl_device *hdev = hpriv->hdev; - struct hl_pll_frequency_info freq_info = {}; + struct hl_pll_frequency_info freq_info = { {0} }; u32 max_size = args->return_size; void __user *out = (void __user *) (uintptr_t) args->return_pointer; int rc; diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 1f1926607c5e..278c4de98e22 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -151,19 +151,6 @@ static const u16 gaudi_packet_sizes[MAX_PACKET_ID] = { [PACKET_LOAD_AND_EXE] = sizeof(struct packet_load_and_exe) }; -static const u32 gaudi_pll_base_addresses[GAUDI_PLL_MAX] = { - [CPU_PLL] = mmPSOC_CPU_PLL_NR, - [PCI_PLL] = mmPSOC_PCI_PLL_NR, - [SRAM_PLL] = mmSRAM_W_PLL_NR, - [HBM_PLL] = mmPSOC_HBM_PLL_NR, - [NIC_PLL] = mmNIC0_PLL_NR, - [DMA_PLL] = mmDMA_W_PLL_NR, - [MESH_PLL] = mmMESH_W_PLL_NR, - [MME_PLL] = mmPSOC_MME_PLL_NR, - [TPC_PLL] = mmPSOC_TPC_PLL_NR, - [IF_PLL] = mmIF_W_PLL_NR -}; - static inline bool validate_packet_id(enum packet_id id) { switch (id) { @@ -702,73 +689,6 @@ static int gaudi_early_fini(struct hl_device *hdev) return 0; } -/** - * gaudi_fetch_pll_frequency - Fetch PLL frequency values - * - * @hdev: pointer to hl_device structure - * @pll_index: index of the pll to fetch frequency from - * @pll_freq: pointer to store the pll frequency in MHz in each of the available - * outputs. if a certain output is not available a 0 will be set - * - */ -static int gaudi_fetch_pll_frequency(struct hl_device *hdev, - enum gaudi_pll_index pll_index, - u16 *pll_freq_arr) -{ - u32 nr = 0, nf = 0, od = 0, pll_clk = 0, div_fctr, div_sel, - pll_base_addr = gaudi_pll_base_addresses[pll_index]; - u16 freq = 0; - int i, rc; - - if (hdev->asic_prop.fw_security_status_valid && - (hdev->asic_prop.fw_app_security_map & - CPU_BOOT_DEV_STS0_PLL_INFO_EN)) { - rc = hl_fw_cpucp_pll_info_get(hdev, pll_index, pll_freq_arr); - - if (rc) - return rc; - } else if (hdev->asic_prop.fw_security_disabled) { - /* Backward compatibility */ - nr = RREG32(pll_base_addr + PLL_NR_OFFSET); - nf = RREG32(pll_base_addr + PLL_NF_OFFSET); - od = RREG32(pll_base_addr + PLL_OD_OFFSET); - - for (i = 0; i < HL_PLL_NUM_OUTPUTS; i++) { - div_fctr = RREG32(pll_base_addr + - PLL_DIV_FACTOR_0_OFFSET + i * 4); - div_sel = RREG32(pll_base_addr + - PLL_DIV_SEL_0_OFFSET + i * 4); - - if (div_sel == DIV_SEL_REF_CLK || - div_sel == DIV_SEL_DIVIDED_REF) { - if (div_sel == DIV_SEL_REF_CLK) - freq = PLL_REF_CLK; - else - freq = PLL_REF_CLK / (div_fctr + 1); - } else if (div_sel == DIV_SEL_PLL_CLK || - div_sel == DIV_SEL_DIVIDED_PLL) { - pll_clk = PLL_REF_CLK * (nf + 1) / - ((nr + 1) * (od + 1)); - if (div_sel == DIV_SEL_PLL_CLK) - freq = pll_clk; - else - freq = pll_clk / (div_fctr + 1); - } else { - dev_warn(hdev->dev, - "Received invalid div select value: %d", - div_sel); - } - - pll_freq_arr[i] = freq; - } - } else { - dev_err(hdev->dev, "Failed to fetch PLL frequency values\n"); - return -EIO; - } - - return 0; -} - /** * gaudi_fetch_psoc_frequency - Fetch PSOC frequency values * @@ -778,18 +698,52 @@ static int gaudi_fetch_pll_frequency(struct hl_device *hdev, static int gaudi_fetch_psoc_frequency(struct hl_device *hdev) { struct asic_fixed_properties *prop = &hdev->asic_prop; - u16 pll_freq[HL_PLL_NUM_OUTPUTS]; + u32 nr = 0, nf = 0, od = 0, div_fctr = 0, pll_clk, div_sel; + u16 pll_freq_arr[HL_PLL_NUM_OUTPUTS], freq; int rc; - rc = gaudi_fetch_pll_frequency(hdev, CPU_PLL, pll_freq); - if (rc) - return rc; + if (hdev->asic_prop.fw_security_disabled) { + /* Backward compatibility */ + div_fctr = RREG32(mmPSOC_CPU_PLL_DIV_FACTOR_2); + div_sel = RREG32(mmPSOC_CPU_PLL_DIV_SEL_2); + nr = RREG32(mmPSOC_CPU_PLL_NR); + nf = RREG32(mmPSOC_CPU_PLL_NF); + od = RREG32(mmPSOC_CPU_PLL_OD); - prop->psoc_timestamp_frequency = pll_freq[2]; - prop->psoc_pci_pll_nr = 0; - prop->psoc_pci_pll_nf = 0; - prop->psoc_pci_pll_od = 0; - prop->psoc_pci_pll_div_factor = 0; + if (div_sel == DIV_SEL_REF_CLK || + div_sel == DIV_SEL_DIVIDED_REF) { + if (div_sel == DIV_SEL_REF_CLK) + freq = PLL_REF_CLK; + else + freq = PLL_REF_CLK / (div_fctr + 1); + } else if (div_sel == DIV_SEL_PLL_CLK || + div_sel == DIV_SEL_DIVIDED_PLL) { + pll_clk = PLL_REF_CLK * (nf + 1) / + ((nr + 1) * (od + 1)); + if (div_sel == DIV_SEL_PLL_CLK) + freq = pll_clk; + else + freq = pll_clk / (div_fctr + 1); + } else { + dev_warn(hdev->dev, + "Received invalid div select value: %d", + div_sel); + freq = 0; + } + } else { + rc = hl_fw_cpucp_pll_info_get(hdev, CPU_PLL, pll_freq_arr); + + if (rc) + return rc; + + freq = pll_freq_arr[2]; + } + + prop->psoc_timestamp_frequency = freq; + prop->psoc_pci_pll_nr = nr; + prop->psoc_pci_pll_nf = nf; + prop->psoc_pci_pll_od = od; + prop->psoc_pci_pll_div_factor = div_fctr; return 0; } diff --git a/drivers/misc/habanalabs/gaudi/gaudiP.h b/drivers/misc/habanalabs/gaudi/gaudiP.h index f2d91f4fcffe..a7ab2d7e57d4 100644 --- a/drivers/misc/habanalabs/gaudi/gaudiP.h +++ b/drivers/misc/habanalabs/gaudi/gaudiP.h @@ -105,13 +105,6 @@ #define MME_ACC_OFFSET (mmMME1_ACC_BASE - mmMME0_ACC_BASE) #define SRAM_BANK_OFFSET (mmSRAM_Y0_X1_RTR_BASE - mmSRAM_Y0_X0_RTR_BASE) -#define PLL_NR_OFFSET 0 -#define PLL_NF_OFFSET (mmPSOC_CPU_PLL_NF - mmPSOC_CPU_PLL_NR) -#define PLL_OD_OFFSET (mmPSOC_CPU_PLL_OD - mmPSOC_CPU_PLL_NR) -#define PLL_DIV_FACTOR_0_OFFSET (mmPSOC_CPU_PLL_DIV_FACTOR_0 - \ - mmPSOC_CPU_PLL_NR) -#define PLL_DIV_SEL_0_OFFSET (mmPSOC_CPU_PLL_DIV_SEL_0 - mmPSOC_CPU_PLL_NR) - #define NUM_OF_SOB_IN_BLOCK \ (((mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_2047 - \ mmSYNC_MNGR_E_N_SYNC_MNGR_OBJS_SOB_OBJ_0) + 4) >> 2) From 9c9013cbd8338ff8eac732d115c9005bc512cbc5 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Tue, 1 Dec 2020 10:39:54 +0200 Subject: [PATCH 154/547] habanalabs: preboot hard reset support FW hard reset capability indication is now moved to preboot stage. Driver will check if HW is dirty only after it validated preboot is up. If HW is dirty, driver will perform a hard reset according to the FW capability. In addition, FW defines a new message which driver need to send in order to initiate a hard reset. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/firmware_if.c | 25 ++++++++++--------- drivers/misc/habanalabs/gaudi/gaudi.c | 17 +++++++------ drivers/misc/habanalabs/goya/goya.c | 12 ++++----- .../habanalabs/include/common/hl_boot_if.h | 2 ++ 4 files changed, 31 insertions(+), 25 deletions(-) diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c index 0e1c629e9800..c970bfc6db66 100644 --- a/drivers/misc/habanalabs/common/firmware_if.c +++ b/drivers/misc/habanalabs/common/firmware_if.c @@ -627,7 +627,9 @@ int hl_fw_read_preboot_status(struct hl_device *hdev, u32 cpu_boot_status_reg, security_status = RREG32(cpu_security_boot_status_reg); /* We read security status multiple times during boot: - * 1. preboot - we check if fw security feature is supported + * 1. preboot - a. Check whether the security status bits are valid + * b. Check whether fw security is enabled + * c. Check whether hard reset is done by fw * 2. boot cpu - we get boot cpu security status * 3. FW application - we get FW application security status * @@ -637,13 +639,20 @@ int hl_fw_read_preboot_status(struct hl_device *hdev, u32 cpu_boot_status_reg, */ if (security_status & CPU_BOOT_DEV_STS0_ENABLED) { hdev->asic_prop.fw_security_status_valid = 1; - prop->fw_security_disabled = - !(security_status & CPU_BOOT_DEV_STS0_SECURITY_EN); + + if (!(security_status & CPU_BOOT_DEV_STS0_SECURITY_EN)) + prop->fw_security_disabled = true; + + if (security_status & CPU_BOOT_DEV_STS0_FW_HARD_RST_EN) + hdev->asic_prop.hard_reset_done_by_fw = true; } else { hdev->asic_prop.fw_security_status_valid = 0; prop->fw_security_disabled = true; } + dev_dbg(hdev->dev, "Firmware hard-reset is %s\n", + hdev->asic_prop.hard_reset_done_by_fw ? "enabled" : "disabled"); + dev_info(hdev->dev, "firmware-level security is %s\n", prop->fw_security_disabled ? "disabled" : "enabled"); @@ -797,18 +806,10 @@ int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg, } /* Read FW application security bits */ - if (hdev->asic_prop.fw_security_status_valid) { + if (hdev->asic_prop.fw_security_status_valid) hdev->asic_prop.fw_app_security_map = RREG32(cpu_security_boot_status_reg); - if (hdev->asic_prop.fw_app_security_map & - CPU_BOOT_DEV_STS0_FW_HARD_RST_EN) - hdev->asic_prop.hard_reset_done_by_fw = true; - } - - dev_dbg(hdev->dev, "Firmware hard-reset is %s\n", - hdev->asic_prop.hard_reset_done_by_fw ? "enabled" : "disabled"); - dev_info(hdev->dev, "Successfully loaded firmware to device\n"); out: diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 278c4de98e22..e465c158eaeb 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -654,12 +654,6 @@ static int gaudi_early_init(struct hl_device *hdev) if (rc) goto free_queue_props; - if (gaudi_get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) { - dev_info(hdev->dev, - "H/W state is dirty, must reset before initializing\n"); - hdev->asic_funcs->hw_fini(hdev, true); - } - /* Before continuing in the initialization, we need to read the preboot * version to determine whether we run with a security-enabled firmware */ @@ -672,6 +666,12 @@ static int gaudi_early_init(struct hl_device *hdev) goto pci_fini; } + if (gaudi_get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) { + dev_info(hdev->dev, + "H/W state is dirty, must reset before initializing\n"); + hdev->asic_funcs->hw_fini(hdev, true); + } + return 0; pci_fini: @@ -3881,7 +3881,10 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset) /* I don't know what is the state of the CPU so make sure it is * stopped in any means necessary */ - WREG32(mmPSOC_GLOBAL_CONF_KMD_MSG_TO_CPU, KMD_MSG_GOTO_WFE); + if (hdev->asic_prop.hard_reset_done_by_fw) + WREG32(mmPSOC_GLOBAL_CONF_KMD_MSG_TO_CPU, KMD_MSG_RST_DEV); + else + WREG32(mmPSOC_GLOBAL_CONF_KMD_MSG_TO_CPU, KMD_MSG_GOTO_WFE); WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR, GAUDI_EVENT_HALT_MACHINE); diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index b66fd55accb5..d61177bf36a5 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -613,12 +613,6 @@ static int goya_early_init(struct hl_device *hdev) if (rc) goto free_queue_props; - if (goya_get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) { - dev_info(hdev->dev, - "H/W state is dirty, must reset before initializing\n"); - hdev->asic_funcs->hw_fini(hdev, true); - } - /* Before continuing in the initialization, we need to read the preboot * version to determine whether we run with a security-enabled firmware */ @@ -631,6 +625,12 @@ static int goya_early_init(struct hl_device *hdev) goto pci_fini; } + if (goya_get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) { + dev_info(hdev->dev, + "H/W state is dirty, must reset before initializing\n"); + hdev->asic_funcs->hw_fini(hdev, true); + } + if (!hdev->pldm) { val = RREG32(mmPSOC_GLOBAL_CONF_BOOT_STRAP_PINS); if (val & PSOC_GLOBAL_CONF_BOOT_STRAP_PINS_SRIOV_EN_MASK) diff --git a/drivers/misc/habanalabs/include/common/hl_boot_if.h b/drivers/misc/habanalabs/include/common/hl_boot_if.h index e5801ecf0cb2..755c4800f002 100644 --- a/drivers/misc/habanalabs/include/common/hl_boot_if.h +++ b/drivers/misc/habanalabs/include/common/hl_boot_if.h @@ -204,6 +204,8 @@ enum kmd_msg { KMD_MSG_GOTO_WFE, KMD_MSG_FIT_RDY, KMD_MSG_SKIP_BMC, + RESERVED, + KMD_MSG_RST_DEV, }; enum cpu_msg_status { From 72ab9ca52de6856380c26b2045aa826ae4308b76 Mon Sep 17 00:00:00 2001 From: Alon Mizrahi Date: Wed, 2 Dec 2020 19:55:30 +0200 Subject: [PATCH 155/547] habanalabs/gaudi: do not set EB in collective slave queues We don't need to set EB on signal packets from collective slave queues as it degrades performance. Because the slaves are the network queues, the engine barrier doesn't actually guarantee that the packet has been sent. Signed-off-by: Alon Mizrahi Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/habanalabs.h | 2 +- drivers/misc/habanalabs/common/hw_queue.c | 5 ++++- drivers/misc/habanalabs/gaudi/gaudi.c | 8 ++++---- drivers/misc/habanalabs/goya/goya.c | 2 +- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 571eda6ef5ab..70b778a0d60e 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -944,7 +944,7 @@ struct hl_asic_funcs { u32 (*get_signal_cb_size)(struct hl_device *hdev); u32 (*get_wait_cb_size)(struct hl_device *hdev); u32 (*gen_signal_cb)(struct hl_device *hdev, void *data, u16 sob_id, - u32 size); + u32 size, bool eb); u32 (*gen_wait_cb)(struct hl_device *hdev, struct hl_gen_wait_properties *prop); void (*reset_sob)(struct hl_device *hdev, void *data); diff --git a/drivers/misc/habanalabs/common/hw_queue.c b/drivers/misc/habanalabs/common/hw_queue.c index 7caf868d1585..76217258780a 100644 --- a/drivers/misc/habanalabs/common/hw_queue.c +++ b/drivers/misc/habanalabs/common/hw_queue.c @@ -418,8 +418,11 @@ static void init_signal_cs(struct hl_device *hdev, "generate signal CB, sob_id: %d, sob val: 0x%x, q_idx: %d\n", cs_cmpl->hw_sob->sob_id, cs_cmpl->sob_val, q_idx); + /* we set an EB since we must make sure all oeprations are done + * when sending the signal + */ hdev->asic_funcs->gen_signal_cb(hdev, job->patched_cb, - cs_cmpl->hw_sob->sob_id, 0); + cs_cmpl->hw_sob->sob_id, 0, true); kref_get(&hw_sob->kref); diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index e465c158eaeb..65895ba075fe 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -361,7 +361,7 @@ static int gaudi_cpucp_info_get(struct hl_device *hdev); static void gaudi_disable_clock_gating(struct hl_device *hdev); static void gaudi_mmu_prepare(struct hl_device *hdev, u32 asid); static u32 gaudi_gen_signal_cb(struct hl_device *hdev, void *data, u16 sob_id, - u32 size); + u32 size, bool eb); static u32 gaudi_gen_wait_cb(struct hl_device *hdev, struct hl_gen_wait_properties *prop); @@ -1064,7 +1064,7 @@ static void gaudi_collective_slave_init_job(struct hl_device *hdev, prop->collective_sob_id, queue_id); cb_size += gaudi_gen_signal_cb(hdev, job->user_cb, - prop->collective_sob_id, cb_size); + prop->collective_sob_id, cb_size, false); } static void gaudi_collective_wait_init_cs(struct hl_cs *cs) @@ -7893,7 +7893,7 @@ static u32 gaudi_get_wait_cb_size(struct hl_device *hdev) } static u32 gaudi_gen_signal_cb(struct hl_device *hdev, void *data, u16 sob_id, - u32 size) + u32 size, bool eb) { struct hl_cb *cb = (struct hl_cb *) data; struct packet_msg_short *pkt; @@ -7910,7 +7910,7 @@ static u32 gaudi_gen_signal_cb(struct hl_device *hdev, void *data, u16 sob_id, ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_OP_MASK, 0); /* write the value */ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_BASE_MASK, 3); /* W_S SOB base */ ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_OPCODE_MASK, PACKET_MSG_SHORT); - ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_EB_MASK, 1); + ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_EB_MASK, eb); ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_RB_MASK, 1); ctl |= FIELD_PREP(GAUDI_PKT_SHORT_CTL_MB_MASK, 1); diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index d61177bf36a5..b8b4aa636b7c 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -5339,7 +5339,7 @@ static u32 goya_get_wait_cb_size(struct hl_device *hdev) } static u32 goya_gen_signal_cb(struct hl_device *hdev, void *data, u16 sob_id, - u32 size) + u32 size, bool eb) { return 0; } From 7a585dfc32110a106f70474c6fa822d912a92c7e Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Mon, 30 Nov 2020 14:56:06 +0200 Subject: [PATCH 156/547] habanalabs: Revise comment to align with mirror list name hw_queues_mirror was renamed to cs_mirror, so revise accordingly a comment that refers to this list. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/command_submission.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index beb482310a58..92c1c516b65f 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -562,7 +562,7 @@ void hl_cs_rollback_all(struct hl_device *hdev) for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) flush_workqueue(hdev->cq_wq[i]); - /* Make sure we don't have leftovers in the H/W queues mirror list */ + /* Make sure we don't have leftovers in the CS mirror list */ list_for_each_entry_safe(cs, tmp, &hdev->cs_mirror_list, mirror_node) { cs_get(cs); cs->aborted = true; From 0024c094851f718ccb0b797255292bdce850a01f Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Sat, 5 Dec 2020 22:55:09 +0200 Subject: [PATCH 157/547] habanalabs/gaudi: disable CGM at HW initialization In case the clock gating was enabled in preboot we need to disable it at the H/W initialization stage before touching the MME/TPC registers. Otherwise, the ASIC can get stuck. If the security is enabled in the firmware level, the CGM is always disabled and the driver can't enable it. Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi/gaudi.c | 14 +++++++++++--- .../misc/habanalabs/include/common/hl_boot_if.h | 5 +++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 65895ba075fe..f316b898e8e0 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -2403,8 +2403,6 @@ static void gaudi_init_golden_registers(struct hl_device *hdev) gaudi_init_e2e(hdev); gaudi_init_hbm_cred(hdev); - hdev->asic_funcs->disable_clock_gating(hdev); - for (tpc_id = 0, tpc_offset = 0; tpc_id < TPC_NUMBER_OF_ENGINES; tpc_id++, tpc_offset += TPC_CFG_OFFSET) { @@ -3416,6 +3414,9 @@ static void gaudi_set_clock_gating(struct hl_device *hdev) if (hdev->in_debug) return; + if (!hdev->asic_prop.fw_security_disabled) + return; + for (i = GAUDI_PCI_DMA_1, qman_offset = 0 ; i < GAUDI_HBM_DMA_1 ; i++) { enable = !!(hdev->clock_gating_mask & (BIT_ULL(gaudi_dma_assignment[i]))); @@ -3467,7 +3468,7 @@ static void gaudi_disable_clock_gating(struct hl_device *hdev) u32 qman_offset; int i; - if (!(gaudi->hw_cap_initialized & HW_CAP_CLK_GATE)) + if (!hdev->asic_prop.fw_security_disabled) return; for (i = 0, qman_offset = 0 ; i < DMA_NUMBER_OF_CHANNELS ; i++) { @@ -3801,6 +3802,13 @@ static int gaudi_hw_init(struct hl_device *hdev) return rc; } + /* In case the clock gating was enabled in preboot we need to disable + * it here before touching the MME/TPC registers. + * There is no need to take clk gating mutex because when this function + * runs, no other relevant code can run + */ + hdev->asic_funcs->disable_clock_gating(hdev); + /* SRAM scrambler must be initialized after CPU is running from HBM */ gaudi_init_scrambler_sram(hdev); diff --git a/drivers/misc/habanalabs/include/common/hl_boot_if.h b/drivers/misc/habanalabs/include/common/hl_boot_if.h index 755c4800f002..7cb5f2d3e565 100644 --- a/drivers/misc/habanalabs/include/common/hl_boot_if.h +++ b/drivers/misc/habanalabs/include/common/hl_boot_if.h @@ -150,6 +150,10 @@ * CPU_BOOT_DEV_STS0_PLL_INFO_EN FW retrieval of PLL info is enabled. * Initialized in: linux * + * CPU_BOOT_DEV_STS0_CLK_GATE_EN Clock Gating enabled. + * FW initialized Clock Gating. + * Initialized in: preboot + * * CPU_BOOT_DEV_STS0_ENABLED Device status register enabled. * This is a main indication that the * running FW populates the device status @@ -171,6 +175,7 @@ #define CPU_BOOT_DEV_STS0_DRAM_SCR_EN (1 << 9) #define CPU_BOOT_DEV_STS0_FW_HARD_RST_EN (1 << 10) #define CPU_BOOT_DEV_STS0_PLL_INFO_EN (1 << 11) +#define CPU_BOOT_DEV_STS0_CLK_GATE_EN (1 << 13) #define CPU_BOOT_DEV_STS0_ENABLED (1 << 31) enum cpu_boot_status { From 6bbb77b9e6f0bd5595724b7c0cb1189afdd133d3 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Sun, 6 Dec 2020 14:00:35 +0200 Subject: [PATCH 158/547] habanalabs: full FW hard reset support Driver must fetch FW hard reset capability at every FW boot stage: preboot, CPU boot, CPU application. If hard reset is triggered, driver will take into consideration only the last capability received. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/firmware_if.c | 55 +++++++++++++++----- 1 file changed, 41 insertions(+), 14 deletions(-) diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c index c970bfc6db66..20f77f58edef 100644 --- a/drivers/misc/habanalabs/common/firmware_if.c +++ b/drivers/misc/habanalabs/common/firmware_if.c @@ -629,32 +629,36 @@ int hl_fw_read_preboot_status(struct hl_device *hdev, u32 cpu_boot_status_reg, /* We read security status multiple times during boot: * 1. preboot - a. Check whether the security status bits are valid * b. Check whether fw security is enabled - * c. Check whether hard reset is done by fw - * 2. boot cpu - we get boot cpu security status - * 3. FW application - we get FW application security status + * c. Check whether hard reset is done by preboot + * 2. boot cpu - a. Fetch boot cpu security status + * b. Check whether hard reset is done by boot cpu + * 3. FW application - a. Fetch fw application security status + * b. Check whether hard reset is done by fw app * * Preboot: * Check security status bit (CPU_BOOT_DEV_STS0_ENABLED), if it is set * check security enabled bit (CPU_BOOT_DEV_STS0_SECURITY_EN) */ if (security_status & CPU_BOOT_DEV_STS0_ENABLED) { - hdev->asic_prop.fw_security_status_valid = 1; + prop->fw_security_status_valid = 1; - if (!(security_status & CPU_BOOT_DEV_STS0_SECURITY_EN)) + if (security_status & CPU_BOOT_DEV_STS0_SECURITY_EN) + prop->fw_security_disabled = false; + else prop->fw_security_disabled = true; if (security_status & CPU_BOOT_DEV_STS0_FW_HARD_RST_EN) - hdev->asic_prop.hard_reset_done_by_fw = true; + prop->hard_reset_done_by_fw = true; } else { - hdev->asic_prop.fw_security_status_valid = 0; + prop->fw_security_status_valid = 0; prop->fw_security_disabled = true; } - dev_dbg(hdev->dev, "Firmware hard-reset is %s\n", - hdev->asic_prop.hard_reset_done_by_fw ? "enabled" : "disabled"); + dev_dbg(hdev->dev, "Firmware preboot hard-reset is %s\n", + prop->hard_reset_done_by_fw ? "enabled" : "disabled"); dev_info(hdev->dev, "firmware-level security is %s\n", - prop->fw_security_disabled ? "disabled" : "enabled"); + prop->fw_security_disabled ? "disabled" : "enabled"); return 0; } @@ -664,6 +668,7 @@ int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg, u32 cpu_security_boot_status_reg, u32 boot_err0_reg, bool skip_bmc, u32 cpu_timeout, u32 boot_fit_timeout) { + struct asic_fixed_properties *prop = &hdev->asic_prop; u32 status; int rc; @@ -732,11 +737,22 @@ int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg, /* Read U-Boot version now in case we will later fail */ hdev->asic_funcs->read_device_fw_version(hdev, FW_COMP_UBOOT); + /* Clear reset status since we need to read it again from boot CPU */ + prop->hard_reset_done_by_fw = false; + /* Read boot_cpu security bits */ - if (hdev->asic_prop.fw_security_status_valid) - hdev->asic_prop.fw_boot_cpu_security_map = + if (prop->fw_security_status_valid) { + prop->fw_boot_cpu_security_map = RREG32(cpu_security_boot_status_reg); + if (prop->fw_boot_cpu_security_map & + CPU_BOOT_DEV_STS0_FW_HARD_RST_EN) + prop->hard_reset_done_by_fw = true; + } + + dev_dbg(hdev->dev, "Firmware boot CPU hard-reset is %s\n", + prop->hard_reset_done_by_fw ? "enabled" : "disabled"); + if (rc) { detect_cpu_boot_status(hdev, status); rc = -EIO; @@ -805,11 +821,22 @@ int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg, goto out; } + /* Clear reset status since we need to read again from app */ + prop->hard_reset_done_by_fw = false; + /* Read FW application security bits */ - if (hdev->asic_prop.fw_security_status_valid) - hdev->asic_prop.fw_app_security_map = + if (prop->fw_security_status_valid) { + prop->fw_app_security_map = RREG32(cpu_security_boot_status_reg); + if (prop->fw_app_security_map & + CPU_BOOT_DEV_STS0_FW_HARD_RST_EN) + prop->hard_reset_done_by_fw = true; + } + + dev_dbg(hdev->dev, "Firmware application CPU hard-reset is %s\n", + prop->hard_reset_done_by_fw ? "enabled" : "disabled"); + dev_info(hdev->dev, "Successfully loaded firmware to device\n"); out: From 13d0ee10b55ecec01fd3c91e086e4f3ba75a7911 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Sun, 6 Dec 2020 23:48:45 +0200 Subject: [PATCH 159/547] habanalabs/gaudi: enhance reset message Print the initiator who performs the hard-reset for easier debugging. Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi/gaudi.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index f316b898e8e0..b0528883a995 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -3936,11 +3936,15 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset) WREG32(mmPSOC_GLOBAL_CONF_SW_ALL_RST, 1 << PSOC_GLOBAL_CONF_SW_ALL_RST_IND_SHIFT); - } - dev_info(hdev->dev, - "Issued HARD reset command, going to wait %dms\n", - reset_timeout_ms); + dev_info(hdev->dev, + "Issued HARD reset command, going to wait %dms\n", + reset_timeout_ms); + } else { + dev_info(hdev->dev, + "Firmware performs HARD reset, going to wait %dms\n", + reset_timeout_ms); + } /* * After hard reset, we can't poll the BTM_FSM register because the PSOC From 90ffe170a390d5a620f8fe66758514e369e85d24 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Tue, 8 Dec 2020 16:52:48 +0200 Subject: [PATCH 160/547] habanalabs: update comment in hl_boot_if.h Hard-reset flag is updated in many stages of the boot sequence of the firmware. Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/include/common/hl_boot_if.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/habanalabs/include/common/hl_boot_if.h b/drivers/misc/habanalabs/include/common/hl_boot_if.h index 7cb5f2d3e565..b637dfd69f6e 100644 --- a/drivers/misc/habanalabs/include/common/hl_boot_if.h +++ b/drivers/misc/habanalabs/include/common/hl_boot_if.h @@ -145,7 +145,7 @@ * implemented. This means that FW will * perform hard reset procedure on * receiving the halt-machine event. - * Initialized in: linux + * Initialized in: preboot, u-boot, linux * * CPU_BOOT_DEV_STS0_PLL_INFO_EN FW retrieval of PLL info is enabled. * Initialized in: linux From 377182a3cc5ae6cc17fb04d06864c975f9f71c18 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Wed, 9 Dec 2020 19:50:46 +0200 Subject: [PATCH 161/547] habanalabs: adjust pci controller init to new firmware When the firmware security is enabled, the pcie_aux_dbi_reg_addr register in the PCI controller is blocked. Therefore, ignore the result of writing to this register and assume it worked. Also remove the prints on errors in the internal ELBI write function. If the security is enabled, the firmware is responsible for setting this register correctly so we won't have any problem. If the security is disabled, the write will work (unless something is totally broken at the PCI level and then the whole sequence will fail). In addition, remove a write to register pcie_aux_dbi_reg_addr+4, which was never actually needed. Moreover, PCIE_DBI registers are blocked to access from host when firmware security is enabled. Use a different register to flush the writes. Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/pci.c | 28 +++++++++++-------- drivers/misc/habanalabs/gaudi/gaudi.c | 4 +-- .../misc/habanalabs/gaudi/gaudi_coresight.c | 3 +- 3 files changed, 21 insertions(+), 14 deletions(-) diff --git a/drivers/misc/habanalabs/common/pci.c b/drivers/misc/habanalabs/common/pci.c index 923b2606e29f..b4725e6101f6 100644 --- a/drivers/misc/habanalabs/common/pci.c +++ b/drivers/misc/habanalabs/common/pci.c @@ -130,10 +130,8 @@ static int hl_pci_elbi_write(struct hl_device *hdev, u64 addr, u32 data) if ((val & PCI_CONFIG_ELBI_STS_MASK) == PCI_CONFIG_ELBI_STS_DONE) return 0; - if (val & PCI_CONFIG_ELBI_STS_ERR) { - dev_err(hdev->dev, "Error writing to ELBI\n"); + if (val & PCI_CONFIG_ELBI_STS_ERR) return -EIO; - } if (!(val & PCI_CONFIG_ELBI_STS_MASK)) { dev_err(hdev->dev, "ELBI write didn't finish in time\n"); @@ -160,8 +158,12 @@ int hl_pci_iatu_write(struct hl_device *hdev, u32 addr, u32 data) dbi_offset = addr & 0xFFF; - rc = hl_pci_elbi_write(hdev, prop->pcie_aux_dbi_reg_addr, 0x00300000); - rc |= hl_pci_elbi_write(hdev, prop->pcie_dbi_base_address + dbi_offset, + /* Ignore result of writing to pcie_aux_dbi_reg_addr as it could fail + * in case the firmware security is enabled + */ + hl_pci_elbi_write(hdev, prop->pcie_aux_dbi_reg_addr, 0x00300000); + + rc = hl_pci_elbi_write(hdev, prop->pcie_dbi_base_address + dbi_offset, data); if (rc) @@ -244,9 +246,11 @@ int hl_pci_set_inbound_region(struct hl_device *hdev, u8 region, rc |= hl_pci_iatu_write(hdev, offset + 0x4, ctrl_reg_val); - /* Return the DBI window to the default location */ - rc |= hl_pci_elbi_write(hdev, prop->pcie_aux_dbi_reg_addr, 0); - rc |= hl_pci_elbi_write(hdev, prop->pcie_aux_dbi_reg_addr + 4, 0); + /* Return the DBI window to the default location + * Ignore result of writing to pcie_aux_dbi_reg_addr as it could fail + * in case the firmware security is enabled + */ + hl_pci_elbi_write(hdev, prop->pcie_aux_dbi_reg_addr, 0); if (rc) dev_err(hdev->dev, "failed to map bar %u to 0x%08llx\n", @@ -294,9 +298,11 @@ int hl_pci_set_outbound_region(struct hl_device *hdev, /* Enable */ rc |= hl_pci_iatu_write(hdev, 0x004, 0x80000000); - /* Return the DBI window to the default location */ - rc |= hl_pci_elbi_write(hdev, prop->pcie_aux_dbi_reg_addr, 0); - rc |= hl_pci_elbi_write(hdev, prop->pcie_aux_dbi_reg_addr + 4, 0); + /* Return the DBI window to the default location + * Ignore result of writing to pcie_aux_dbi_reg_addr as it could fail + * in case the firmware security is enabled + */ + hl_pci_elbi_write(hdev, prop->pcie_aux_dbi_reg_addr, 0); return rc; } diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index b0528883a995..88d0e4356d59 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -3761,7 +3761,7 @@ static int gaudi_init_cpu_queues(struct hl_device *hdev, u32 cpu_timeout) static void gaudi_pre_hw_init(struct hl_device *hdev) { /* Perform read from the device to make sure device is up */ - RREG32(mmPCIE_DBI_DEVICE_ID_VENDOR_ID_REG); + RREG32(mmHW_STATE); if (hdev->asic_prop.fw_security_disabled) { /* Set the access through PCI bars (Linux driver only) as @@ -3847,7 +3847,7 @@ static int gaudi_hw_init(struct hl_device *hdev) } /* Perform read from the device to flush all configuration */ - RREG32(mmPCIE_DBI_DEVICE_ID_VENDOR_ID_REG); + RREG32(mmHW_STATE); return 0; diff --git a/drivers/misc/habanalabs/gaudi/gaudi_coresight.c b/drivers/misc/habanalabs/gaudi/gaudi_coresight.c index 2e3612e1ee28..88a09d42e111 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi_coresight.c +++ b/drivers/misc/habanalabs/gaudi/gaudi_coresight.c @@ -9,6 +9,7 @@ #include "../include/gaudi/gaudi_coresight.h" #include "../include/gaudi/asic_reg/gaudi_regs.h" #include "../include/gaudi/gaudi_masks.h" +#include "../include/gaudi/gaudi_reg_map.h" #include #define SPMU_SECTION_SIZE MME0_ACC_SPMU_MAX_OFFSET @@ -874,7 +875,7 @@ int gaudi_debug_coresight(struct hl_device *hdev, void *data) } /* Perform read from the device to flush all configuration */ - RREG32(mmPCIE_DBI_DEVICE_ID_VENDOR_ID_REG); + RREG32(mmHW_STATE); return rc; } From 98e8781f008372057bd5cb059ca6b507371e473d Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Wed, 9 Dec 2020 23:07:58 +0200 Subject: [PATCH 162/547] habanalabs/gaudi: retry loading TPC f/w on -EINTR If loading the firmware file for the TPC f/w was interrupted, try to do it again, up to 5 times. Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi/gaudi.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 88d0e4356d59..8c09e4466af8 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -838,11 +838,17 @@ static int gaudi_init_tpc_mem(struct hl_device *hdev) size_t fw_size; void *cpu_addr; dma_addr_t dma_handle; - int rc; + int rc, count = 5; +again: rc = request_firmware(&fw, GAUDI_TPC_FW_FILE, hdev->dev); + if (rc == -EINTR && count-- > 0) { + msleep(50); + goto again; + } + if (rc) { - dev_err(hdev->dev, "Firmware file %s is not found!\n", + dev_err(hdev->dev, "Failed to load firmware file %s\n", GAUDI_TPC_FW_FILE); goto out; } From a3fd28306329e8e82efab973aafe81e9001dcf6f Mon Sep 17 00:00:00 2001 From: Alon Mizrahi Date: Tue, 8 Dec 2020 16:14:01 +0200 Subject: [PATCH 163/547] habanalabs: add validation cs counter, fix misplaced counters Up until now validation errors were counted in the parsing field of the cs_counters struct, so we added a new counter and increased it when needed. In addition, there were some locations where only one of the counters was updated (ctx or aggregate) so add the second one to be updated as well. Signed-off-by: Alon Mizrahi Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- .../habanalabs/common/command_submission.c | 75 ++++++++++++++----- drivers/misc/habanalabs/common/habanalabs.h | 2 + .../misc/habanalabs/common/habanalabs_ioctl.c | 5 ++ include/uapi/misc/habanalabs.h | 4 + 4 files changed, 68 insertions(+), 18 deletions(-) diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index 92c1c516b65f..b2b3d2b0f808 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -472,8 +472,11 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx, cntr = &hdev->aggregated_cs_counters; cs = kzalloc(sizeof(*cs), GFP_ATOMIC); - if (!cs) + if (!cs) { + atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt); + atomic64_inc(&cntr->out_of_mem_drop_cnt); return -ENOMEM; + } cs->ctx = ctx; cs->submitted = false; @@ -486,6 +489,8 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx, cs_cmpl = kmalloc(sizeof(*cs_cmpl), GFP_ATOMIC); if (!cs_cmpl) { + atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt); + atomic64_inc(&cntr->out_of_mem_drop_cnt); rc = -ENOMEM; goto free_cs; } @@ -513,6 +518,8 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx, cs->jobs_in_queue_cnt = kcalloc(hdev->asic_prop.max_queues, sizeof(*cs->jobs_in_queue_cnt), GFP_ATOMIC); if (!cs->jobs_in_queue_cnt) { + atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt); + atomic64_inc(&cntr->out_of_mem_drop_cnt); rc = -ENOMEM; goto free_fence; } @@ -764,11 +771,14 @@ static int hl_cs_sanity_checks(struct hl_fpriv *hpriv, union hl_cs_args *args) static int hl_cs_copy_chunk_array(struct hl_device *hdev, struct hl_cs_chunk **cs_chunk_array, - void __user *chunks, u32 num_chunks) + void __user *chunks, u32 num_chunks, + struct hl_ctx *ctx) { u32 size_to_copy; if (num_chunks > HL_MAX_JOBS_PER_CS) { + atomic64_inc(&ctx->cs_counters.validation_drop_cnt); + atomic64_inc(&hdev->aggregated_cs_counters.validation_drop_cnt); dev_err(hdev->dev, "Number of chunks can NOT be larger than %d\n", HL_MAX_JOBS_PER_CS); @@ -777,11 +787,16 @@ static int hl_cs_copy_chunk_array(struct hl_device *hdev, *cs_chunk_array = kmalloc_array(num_chunks, sizeof(**cs_chunk_array), GFP_ATOMIC); - if (!*cs_chunk_array) + if (!*cs_chunk_array) { + atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt); + atomic64_inc(&hdev->aggregated_cs_counters.out_of_mem_drop_cnt); return -ENOMEM; + } size_to_copy = num_chunks * sizeof(struct hl_cs_chunk); if (copy_from_user(*cs_chunk_array, chunks, size_to_copy)) { + atomic64_inc(&ctx->cs_counters.validation_drop_cnt); + atomic64_inc(&hdev->aggregated_cs_counters.validation_drop_cnt); dev_err(hdev->dev, "Failed to copy cs chunk array from user\n"); kfree(*cs_chunk_array); return -EFAULT; @@ -797,6 +812,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, struct hl_device *hdev = hpriv->hdev; struct hl_cs_chunk *cs_chunk_array; struct hl_cs_counters_atomic *cntr; + struct hl_ctx *ctx = hpriv->ctx; struct hl_cs_job *job; struct hl_cs *cs; struct hl_cb *cb; @@ -805,7 +821,8 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, cntr = &hdev->aggregated_cs_counters; *cs_seq = ULLONG_MAX; - rc = hl_cs_copy_chunk_array(hdev, &cs_chunk_array, chunks, num_chunks); + rc = hl_cs_copy_chunk_array(hdev, &cs_chunk_array, chunks, num_chunks, + hpriv->ctx); if (rc) goto out; @@ -832,8 +849,8 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, rc = validate_queue_index(hdev, chunk, &queue_type, &is_kernel_allocated_cb); if (rc) { - atomic64_inc(&hpriv->ctx->cs_counters.parsing_drop_cnt); - atomic64_inc(&cntr->parsing_drop_cnt); + atomic64_inc(&ctx->cs_counters.validation_drop_cnt); + atomic64_inc(&cntr->validation_drop_cnt); goto free_cs_object; } @@ -841,8 +858,8 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, cb = get_cb_from_cs_chunk(hdev, &hpriv->cb_mgr, chunk); if (!cb) { atomic64_inc( - &hpriv->ctx->cs_counters.parsing_drop_cnt); - atomic64_inc(&cntr->parsing_drop_cnt); + &ctx->cs_counters.validation_drop_cnt); + atomic64_inc(&cntr->validation_drop_cnt); rc = -EINVAL; goto free_cs_object; } @@ -856,8 +873,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, job = hl_cs_allocate_job(hdev, queue_type, is_kernel_allocated_cb); if (!job) { - atomic64_inc( - &hpriv->ctx->cs_counters.out_of_mem_drop_cnt); + atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt); atomic64_inc(&cntr->out_of_mem_drop_cnt); dev_err(hdev->dev, "Failed to allocate a new job\n"); rc = -ENOMEM; @@ -891,7 +907,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, rc = cs_parser(hpriv, job); if (rc) { - atomic64_inc(&hpriv->ctx->cs_counters.parsing_drop_cnt); + atomic64_inc(&ctx->cs_counters.parsing_drop_cnt); atomic64_inc(&cntr->parsing_drop_cnt); dev_err(hdev->dev, "Failed to parse JOB %d.%llu.%d, err %d, rejecting the CS\n", @@ -901,8 +917,8 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, } if (int_queues_only) { - atomic64_inc(&hpriv->ctx->cs_counters.parsing_drop_cnt); - atomic64_inc(&cntr->parsing_drop_cnt); + atomic64_inc(&ctx->cs_counters.validation_drop_cnt); + atomic64_inc(&cntr->validation_drop_cnt); dev_err(hdev->dev, "Reject CS %d.%llu because only internal queues jobs are present\n", cs->ctx->asid, cs->sequence); @@ -1042,7 +1058,7 @@ out: } static int cs_ioctl_extract_signal_seq(struct hl_device *hdev, - struct hl_cs_chunk *chunk, u64 *signal_seq) + struct hl_cs_chunk *chunk, u64 *signal_seq, struct hl_ctx *ctx) { u64 *signal_seq_arr = NULL; u32 size_to_copy, signal_seq_arr_len; @@ -1052,6 +1068,8 @@ static int cs_ioctl_extract_signal_seq(struct hl_device *hdev, /* currently only one signal seq is supported */ if (signal_seq_arr_len != 1) { + atomic64_inc(&ctx->cs_counters.validation_drop_cnt); + atomic64_inc(&hdev->aggregated_cs_counters.validation_drop_cnt); dev_err(hdev->dev, "Wait for signal CS supports only one signal CS seq\n"); return -EINVAL; @@ -1060,13 +1078,18 @@ static int cs_ioctl_extract_signal_seq(struct hl_device *hdev, signal_seq_arr = kmalloc_array(signal_seq_arr_len, sizeof(*signal_seq_arr), GFP_ATOMIC); - if (!signal_seq_arr) + if (!signal_seq_arr) { + atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt); + atomic64_inc(&hdev->aggregated_cs_counters.out_of_mem_drop_cnt); return -ENOMEM; + } size_to_copy = chunk->num_signal_seq_arr * sizeof(*signal_seq_arr); if (copy_from_user(signal_seq_arr, u64_to_user_ptr(chunk->signal_seq_arr), size_to_copy)) { + atomic64_inc(&ctx->cs_counters.validation_drop_cnt); + atomic64_inc(&hdev->aggregated_cs_counters.validation_drop_cnt); dev_err(hdev->dev, "Failed to copy signal seq array from user\n"); rc = -EFAULT; @@ -1153,6 +1176,7 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, struct hl_device *hdev = hpriv->hdev; struct hl_cs_compl *sig_waitcs_cmpl; u32 q_idx, collective_engine_id = 0; + struct hl_cs_counters_atomic *cntr; struct hl_fence *sig_fence = NULL; struct hl_ctx *ctx = hpriv->ctx; enum hl_queue_type q_type; @@ -1160,9 +1184,11 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, u64 signal_seq; int rc; + cntr = &hdev->aggregated_cs_counters; *cs_seq = ULLONG_MAX; - rc = hl_cs_copy_chunk_array(hdev, &cs_chunk_array, chunks, num_chunks); + rc = hl_cs_copy_chunk_array(hdev, &cs_chunk_array, chunks, num_chunks, + ctx); if (rc) goto out; @@ -1170,6 +1196,8 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, chunk = &cs_chunk_array[0]; if (chunk->queue_index >= hdev->asic_prop.max_queues) { + atomic64_inc(&ctx->cs_counters.validation_drop_cnt); + atomic64_inc(&cntr->validation_drop_cnt); dev_err(hdev->dev, "Queue index %d is invalid\n", chunk->queue_index); rc = -EINVAL; @@ -1181,6 +1209,8 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, q_type = hw_queue_prop->type; if (!hw_queue_prop->supports_sync_stream) { + atomic64_inc(&ctx->cs_counters.validation_drop_cnt); + atomic64_inc(&cntr->validation_drop_cnt); dev_err(hdev->dev, "Queue index %d does not support sync stream operations\n", q_idx); @@ -1190,6 +1220,8 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, if (cs_type == CS_TYPE_COLLECTIVE_WAIT) { if (!(hw_queue_prop->collective_mode == HL_COLLECTIVE_MASTER)) { + atomic64_inc(&ctx->cs_counters.validation_drop_cnt); + atomic64_inc(&cntr->validation_drop_cnt); dev_err(hdev->dev, "Queue index %d is invalid\n", q_idx); rc = -EINVAL; @@ -1200,12 +1232,14 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, } if (cs_type == CS_TYPE_WAIT || cs_type == CS_TYPE_COLLECTIVE_WAIT) { - rc = cs_ioctl_extract_signal_seq(hdev, chunk, &signal_seq); + rc = cs_ioctl_extract_signal_seq(hdev, chunk, &signal_seq, ctx); if (rc) goto free_cs_chunk_array; sig_fence = hl_ctx_get_fence(ctx, signal_seq); if (IS_ERR(sig_fence)) { + atomic64_inc(&ctx->cs_counters.validation_drop_cnt); + atomic64_inc(&cntr->validation_drop_cnt); dev_err(hdev->dev, "Failed to get signal CS with seq 0x%llx\n", signal_seq); @@ -1223,6 +1257,8 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, container_of(sig_fence, struct hl_cs_compl, base_fence); if (sig_waitcs_cmpl->type != CS_TYPE_SIGNAL) { + atomic64_inc(&ctx->cs_counters.validation_drop_cnt); + atomic64_inc(&cntr->validation_drop_cnt); dev_err(hdev->dev, "CS seq 0x%llx is not of a signal CS\n", signal_seq); @@ -1270,8 +1306,11 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, else if (cs_type == CS_TYPE_COLLECTIVE_WAIT) rc = hdev->asic_funcs->collective_wait_create_jobs(hdev, ctx, cs, q_idx, collective_engine_id); - else + else { + atomic64_inc(&ctx->cs_counters.validation_drop_cnt); + atomic64_inc(&cntr->validation_drop_cnt); rc = -EINVAL; + } if (rc) goto free_cs_object; diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 70b778a0d60e..e0d7f5fbaa5c 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -1000,6 +1000,7 @@ struct hl_va_range { * @queue_full_drop_cnt: dropped due to queue full * @device_in_reset_drop_cnt: dropped due to device in reset * @max_cs_in_flight_drop_cnt: dropped due to maximum CS in-flight + * @validation_drop_cnt: dropped due to error in validation */ struct hl_cs_counters_atomic { atomic64_t out_of_mem_drop_cnt; @@ -1007,6 +1008,7 @@ struct hl_cs_counters_atomic { atomic64_t queue_full_drop_cnt; atomic64_t device_in_reset_drop_cnt; atomic64_t max_cs_in_flight_drop_cnt; + atomic64_t validation_drop_cnt; }; /** diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c index a0c0d20f6f8f..12efbd9d2e3a 100644 --- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c @@ -335,6 +335,8 @@ static int cs_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args) atomic64_read(&cntr->device_in_reset_drop_cnt); cs_counters.total_max_cs_in_flight_drop_cnt = atomic64_read(&cntr->max_cs_in_flight_drop_cnt); + cs_counters.total_validation_drop_cnt = + atomic64_read(&cntr->validation_drop_cnt); if (hpriv->ctx) { cs_counters.ctx_out_of_mem_drop_cnt = @@ -352,6 +354,9 @@ static int cs_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args) cs_counters.ctx_max_cs_in_flight_drop_cnt = atomic64_read( &hpriv->ctx->cs_counters.max_cs_in_flight_drop_cnt); + cs_counters.ctx_validation_drop_cnt = + atomic64_read( + &hpriv->ctx->cs_counters.validation_drop_cnt); } return copy_to_user(out, &cs_counters, diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index dc8bcec195cc..dba3827c43ca 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -426,6 +426,8 @@ struct hl_info_sync_manager { * @ctx_device_in_reset_drop_cnt: context dropped due to device in reset * @total_max_cs_in_flight_drop_cnt: total dropped due to maximum CS in-flight * @ctx_max_cs_in_flight_drop_cnt: context dropped due to maximum CS in-flight + * @total_validation_drop_cnt: total dropped due to validation error + * @ctx_validation_drop_cnt: context dropped due to validation error */ struct hl_info_cs_counters { __u64 total_out_of_mem_drop_cnt; @@ -438,6 +440,8 @@ struct hl_info_cs_counters { __u64 ctx_device_in_reset_drop_cnt; __u64 total_max_cs_in_flight_drop_cnt; __u64 ctx_max_cs_in_flight_drop_cnt; + __u64 total_validation_drop_cnt; + __u64 ctx_validation_drop_cnt; }; enum gaudi_dcores { From fcaebc7354188b0d708c79df4390fbabd4d9799d Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Mon, 14 Dec 2020 12:52:06 +0200 Subject: [PATCH 164/547] habanalabs: register to pci shutdown callback We need to make sure our device is idle when rebooting a virtual machine. This is done in the driver level. The firmware will later handle FLR but we want to be extra safe and stop the devices until the FLR is handled. Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/habanalabs_drv.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c index 6bbb6bca6860..032d114f01ea 100644 --- a/drivers/misc/habanalabs/common/habanalabs_drv.c +++ b/drivers/misc/habanalabs/common/habanalabs_drv.c @@ -544,6 +544,7 @@ static struct pci_driver hl_pci_driver = { .id_table = ids, .probe = hl_pci_probe, .remove = hl_pci_remove, + .shutdown = hl_pci_remove, .driver.pm = &hl_pm_ops, .err_handler = &hl_pci_err_handler, }; From 097c62b6f0ec2bdadf86afbe80df03856338724d Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Tue, 22 Dec 2020 15:21:07 +0200 Subject: [PATCH 165/547] habanalabs: fix order of status check When the device is in reset or needs to be reset, the disabled property is don't-care. Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 5871162a8442..0749c92cbcf6 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -17,12 +17,12 @@ enum hl_device_status hl_device_status(struct hl_device *hdev) { enum hl_device_status status; - if (hdev->disabled) - status = HL_DEVICE_STATUS_MALFUNCTION; - else if (atomic_read(&hdev->in_reset)) + if (atomic_read(&hdev->in_reset)) status = HL_DEVICE_STATUS_IN_RESET; else if (hdev->needs_reset) status = HL_DEVICE_STATUS_NEEDS_RESET; + else if (hdev->disabled) + status = HL_DEVICE_STATUS_MALFUNCTION; else status = HL_DEVICE_STATUS_OPERATIONAL; From 95cd4bca7b1f4a25810f3ddfc5e767fb46931789 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 27 Dec 2020 12:33:44 +0100 Subject: [PATCH 166/547] netfilter: nft_dynset: report EOPNOTSUPP on missing set feature If userspace requests a feature which is not available the original set definition, then bail out with EOPNOTSUPP. If userspace sends unsupported dynset flags (new feature not supported by this kernel), then report EOPNOTSUPP to userspace. EINVAL should be only used to report malformed netlink messages from userspace. Fixes: 22fe54d5fefc ("netfilter: nf_tables: add support for dynamic set updates") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_dynset.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 983a1d5ca3ab..f35df221a633 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -177,7 +177,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx, u32 flags = ntohl(nla_get_be32(tb[NFTA_DYNSET_FLAGS])); if (flags & ~NFT_DYNSET_F_INV) - return -EINVAL; + return -EOPNOTSUPP; if (flags & NFT_DYNSET_F_INV) priv->invert = true; } @@ -210,7 +210,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx, timeout = 0; if (tb[NFTA_DYNSET_TIMEOUT] != NULL) { if (!(set->flags & NFT_SET_TIMEOUT)) - return -EINVAL; + return -EOPNOTSUPP; err = nf_msecs_to_jiffies64(tb[NFTA_DYNSET_TIMEOUT], &timeout); if (err) @@ -224,7 +224,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx, if (tb[NFTA_DYNSET_SREG_DATA] != NULL) { if (!(set->flags & NFT_SET_MAP)) - return -EINVAL; + return -EOPNOTSUPP; if (set->dtype == NFT_DATA_VERDICT) return -EOPNOTSUPP; From b4e70d8dd9ea6bd5d5fb3122586f652326ca09cd Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 27 Dec 2020 12:35:43 +0100 Subject: [PATCH 167/547] netfilter: nftables: add set expression flags The set flag NFT_SET_EXPR provides a hint to the kernel that userspace supports for multiple expressions per set element. In the same direction, NFT_DYNSET_F_EXPR specifies that dynset expression defines multiple expressions per set element. This allows new userspace software with old kernels to bail out with EOPNOTSUPP. This update is similar to ef516e8625dd ("netfilter: nf_tables: reintroduce the NFT_SET_CONCAT flag"). The NFT_SET_EXPR flag needs to be set on when the NFTA_SET_EXPRESSIONS attribute is specified. The NFT_SET_EXPR flag is not set on with NFTA_SET_EXPR to retain backward compatibility in old userspace binaries. Fixes: 48b0ae046ee9 ("netfilter: nftables: netlink support for several set element expressions") Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 3 +++ net/netfilter/nf_tables_api.c | 6 +++++- net/netfilter/nft_dynset.c | 9 +++++++-- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 28b6ee53305f..b1633e7ba529 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -293,6 +293,7 @@ enum nft_rule_compat_attributes { * @NFT_SET_EVAL: set can be updated from the evaluation path * @NFT_SET_OBJECT: set contains stateful objects * @NFT_SET_CONCAT: set contains a concatenation + * @NFT_SET_EXPR: set contains expressions */ enum nft_set_flags { NFT_SET_ANONYMOUS = 0x1, @@ -303,6 +304,7 @@ enum nft_set_flags { NFT_SET_EVAL = 0x20, NFT_SET_OBJECT = 0x40, NFT_SET_CONCAT = 0x80, + NFT_SET_EXPR = 0x100, }; /** @@ -706,6 +708,7 @@ enum nft_dynset_ops { enum nft_dynset_flags { NFT_DYNSET_F_INV = (1 << 0), + NFT_DYNSET_F_EXPR = (1 << 1), }; /** diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 4186b1e52d58..15c467f1a9dd 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4162,7 +4162,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT | NFT_SET_INTERVAL | NFT_SET_TIMEOUT | NFT_SET_MAP | NFT_SET_EVAL | - NFT_SET_OBJECT | NFT_SET_CONCAT)) + NFT_SET_OBJECT | NFT_SET_CONCAT | NFT_SET_EXPR)) return -EOPNOTSUPP; /* Only one of these operations is supported */ if ((flags & (NFT_SET_MAP | NFT_SET_OBJECT)) == @@ -4304,6 +4304,10 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, struct nlattr *tmp; int left; + if (!(flags & NFT_SET_EXPR)) { + err = -EINVAL; + goto err_set_alloc_name; + } i = 0; nla_for_each_nested(tmp, nla[NFTA_SET_EXPRESSIONS], left) { if (i == NFT_SET_EXPR_MAX) { diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index f35df221a633..0b053f75cd60 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -19,6 +19,7 @@ struct nft_dynset { enum nft_registers sreg_key:8; enum nft_registers sreg_data:8; bool invert; + bool expr; u8 num_exprs; u64 timeout; struct nft_expr *expr_array[NFT_SET_EXPR_MAX]; @@ -175,11 +176,12 @@ static int nft_dynset_init(const struct nft_ctx *ctx, if (tb[NFTA_DYNSET_FLAGS]) { u32 flags = ntohl(nla_get_be32(tb[NFTA_DYNSET_FLAGS])); - - if (flags & ~NFT_DYNSET_F_INV) + if (flags & ~(NFT_DYNSET_F_INV | NFT_DYNSET_F_EXPR)) return -EOPNOTSUPP; if (flags & NFT_DYNSET_F_INV) priv->invert = true; + if (flags & NFT_DYNSET_F_EXPR) + priv->expr = true; } set = nft_set_lookup_global(ctx->net, ctx->table, @@ -261,6 +263,9 @@ static int nft_dynset_init(const struct nft_ctx *ctx, struct nlattr *tmp; int left; + if (!priv->expr) + return -EINVAL; + i = 0; nla_for_each_nested(tmp, tb[NFTA_DYNSET_EXPRESSIONS], left) { if (i == NFT_SET_EXPR_MAX) { From 2ca408d9c749c32288bc28725f9f12ba30299e8f Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Mon, 30 Nov 2020 17:30:59 -0500 Subject: [PATCH 168/547] fanotify: Fix sys_fanotify_mark() on native x86-32 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 121b32a58a3a ("x86/entry/32: Use IA32-specific wrappers for syscalls taking 64-bit arguments") converted native x86-32 which take 64-bit arguments to use the compat handlers to allow conversion to passing args via pt_regs. sys_fanotify_mark() was however missed, as it has a general compat handler. Add a config option that will use the syscall wrapper that takes the split args for native 32-bit. [ bp: Fix typo in Kconfig help text. ] Fixes: 121b32a58a3a ("x86/entry/32: Use IA32-specific wrappers for syscalls taking 64-bit arguments") Reported-by: Paweł Jasiak Signed-off-by: Brian Gerst Signed-off-by: Borislav Petkov Acked-by: Jan Kara Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20201130223059.101286-1-brgerst@gmail.com --- arch/Kconfig | 6 ++++++ arch/x86/Kconfig | 1 + fs/notify/fanotify/fanotify_user.c | 17 +++++++---------- include/linux/syscalls.h | 24 ++++++++++++++++++++++++ 4 files changed, 38 insertions(+), 10 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 78c6f05b10f9..24862d15f3a3 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1105,6 +1105,12 @@ config HAVE_ARCH_PFN_VALID config ARCH_SUPPORTS_DEBUG_PAGEALLOC bool +config ARCH_SPLIT_ARG64 + bool + help + If a 32-bit architecture requires 64-bit arguments to be split into + pairs of 32-bit arguments, select this option. + source "kernel/gcov/Kconfig" source "scripts/gcc-plugins/Kconfig" diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7b6dd10b162a..21f851179ff0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -19,6 +19,7 @@ config X86_32 select KMAP_LOCAL select MODULES_USE_ELF_REL select OLD_SIGACTION + select ARCH_SPLIT_ARG64 config X86_64 def_bool y diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 3e01d8f2ab90..dcab112e1f00 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1285,26 +1285,23 @@ fput_and_out: return ret; } +#ifndef CONFIG_ARCH_SPLIT_ARG64 SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags, __u64, mask, int, dfd, const char __user *, pathname) { return do_fanotify_mark(fanotify_fd, flags, mask, dfd, pathname); } +#endif -#ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE6(fanotify_mark, +#if defined(CONFIG_ARCH_SPLIT_ARG64) || defined(CONFIG_COMPAT) +SYSCALL32_DEFINE6(fanotify_mark, int, fanotify_fd, unsigned int, flags, - __u32, mask0, __u32, mask1, int, dfd, + SC_ARG64(mask), int, dfd, const char __user *, pathname) { - return do_fanotify_mark(fanotify_fd, flags, -#ifdef __BIG_ENDIAN - ((__u64)mask0 << 32) | mask1, -#else - ((__u64)mask1 << 32) | mask0, -#endif - dfd, pathname); + return do_fanotify_mark(fanotify_fd, flags, SC_VAL64(__u64, mask), + dfd, pathname); } #endif diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index f3929aff39cf..7688bc983de5 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -251,6 +251,30 @@ static inline int is_syscall_trace_event(struct trace_event_call *tp_event) static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) #endif /* __SYSCALL_DEFINEx */ +/* For split 64-bit arguments on 32-bit architectures */ +#ifdef __LITTLE_ENDIAN +#define SC_ARG64(name) u32, name##_lo, u32, name##_hi +#else +#define SC_ARG64(name) u32, name##_hi, u32, name##_lo +#endif +#define SC_VAL64(type, name) ((type) name##_hi << 32 | name##_lo) + +#ifdef CONFIG_COMPAT +#define SYSCALL32_DEFINE1 COMPAT_SYSCALL_DEFINE1 +#define SYSCALL32_DEFINE2 COMPAT_SYSCALL_DEFINE2 +#define SYSCALL32_DEFINE3 COMPAT_SYSCALL_DEFINE3 +#define SYSCALL32_DEFINE4 COMPAT_SYSCALL_DEFINE4 +#define SYSCALL32_DEFINE5 COMPAT_SYSCALL_DEFINE5 +#define SYSCALL32_DEFINE6 COMPAT_SYSCALL_DEFINE6 +#else +#define SYSCALL32_DEFINE1 SYSCALL_DEFINE1 +#define SYSCALL32_DEFINE2 SYSCALL_DEFINE2 +#define SYSCALL32_DEFINE3 SYSCALL_DEFINE3 +#define SYSCALL32_DEFINE4 SYSCALL_DEFINE4 +#define SYSCALL32_DEFINE5 SYSCALL_DEFINE5 +#define SYSCALL32_DEFINE6 SYSCALL_DEFINE6 +#endif + /* * Called before coming back to user-mode. Returning to user-mode with an * address limit different than USER_DS can allow to overwrite kernel memory. From 512d4a26abdbd11c6ffa03032740e5ab3c62c55b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 28 Dec 2020 14:03:02 +0200 Subject: [PATCH 169/547] interconnect: qcom: fix rpmh link failures When CONFIG_COMPILE_TEST is set, it is possible to build some of the interconnect drivers into the kernel while their dependencies are loadable modules, which is bad: arm-linux-gnueabi-ld: drivers/interconnect/qcom/bcm-voter.o: in function `qcom_icc_bcm_voter_commit': (.text+0x1f8): undefined reference to `rpmh_invalidate' arm-linux-gnueabi-ld: (.text+0x20c): undefined reference to `rpmh_write_batch' arm-linux-gnueabi-ld: (.text+0x2b0): undefined reference to `rpmh_write_batch' arm-linux-gnueabi-ld: (.text+0x2e8): undefined reference to `rpmh_write_batch' arm-linux-gnueabi-ld: drivers/interconnect/qcom/icc-rpmh.o: in function `qcom_icc_bcm_init': (.text+0x2ac): undefined reference to `cmd_db_read_addr' arm-linux-gnueabi-ld: (.text+0x2c8): undefined reference to `cmd_db_read_aux_data' The exact dependencies are a bit complicated, so split them out into a hidden Kconfig symbol that all drivers can in turn depend on to get it right. Fixes: 976daac4a1c5 ("interconnect: qcom: Consolidate interconnect RPMh support") Signed-off-by: Arnd Bergmann Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20201204165030.3747484-1-arnd@kernel.org Signed-off-by: Georgi Djakov --- drivers/interconnect/qcom/Kconfig | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/drivers/interconnect/qcom/Kconfig b/drivers/interconnect/qcom/Kconfig index a8f93ba265f8..b3fb5b02bcf1 100644 --- a/drivers/interconnect/qcom/Kconfig +++ b/drivers/interconnect/qcom/Kconfig @@ -42,13 +42,23 @@ config INTERCONNECT_QCOM_QCS404 This is a driver for the Qualcomm Network-on-Chip on qcs404-based platforms. +config INTERCONNECT_QCOM_RPMH_POSSIBLE + tristate + default INTERCONNECT_QCOM + depends on QCOM_RPMH || (COMPILE_TEST && !QCOM_RPMH) + depends on QCOM_COMMAND_DB || (COMPILE_TEST && !QCOM_COMMAND_DB) + depends on OF || COMPILE_TEST + help + Compile-testing RPMH drivers is possible on other platforms, + but in order to avoid link failures, drivers must not be built-in + when QCOM_RPMH or QCOM_COMMAND_DB are loadable modules + config INTERCONNECT_QCOM_RPMH tristate config INTERCONNECT_QCOM_SC7180 tristate "Qualcomm SC7180 interconnect driver" - depends on INTERCONNECT_QCOM - depends on (QCOM_RPMH && QCOM_COMMAND_DB && OF) || COMPILE_TEST + depends on INTERCONNECT_QCOM_RPMH_POSSIBLE select INTERCONNECT_QCOM_RPMH select INTERCONNECT_QCOM_BCM_VOTER help @@ -57,8 +67,7 @@ config INTERCONNECT_QCOM_SC7180 config INTERCONNECT_QCOM_SDM845 tristate "Qualcomm SDM845 interconnect driver" - depends on INTERCONNECT_QCOM - depends on (QCOM_RPMH && QCOM_COMMAND_DB && OF) || COMPILE_TEST + depends on INTERCONNECT_QCOM_RPMH_POSSIBLE select INTERCONNECT_QCOM_RPMH select INTERCONNECT_QCOM_BCM_VOTER help @@ -67,8 +76,7 @@ config INTERCONNECT_QCOM_SDM845 config INTERCONNECT_QCOM_SM8150 tristate "Qualcomm SM8150 interconnect driver" - depends on INTERCONNECT_QCOM - depends on (QCOM_RPMH && QCOM_COMMAND_DB && OF) || COMPILE_TEST + depends on INTERCONNECT_QCOM_RPMH_POSSIBLE select INTERCONNECT_QCOM_RPMH select INTERCONNECT_QCOM_BCM_VOTER help @@ -77,8 +85,7 @@ config INTERCONNECT_QCOM_SM8150 config INTERCONNECT_QCOM_SM8250 tristate "Qualcomm SM8250 interconnect driver" - depends on INTERCONNECT_QCOM - depends on (QCOM_RPMH && QCOM_COMMAND_DB && OF) || COMPILE_TEST + depends on INTERCONNECT_QCOM_RPMH_POSSIBLE select INTERCONNECT_QCOM_RPMH select INTERCONNECT_QCOM_BCM_VOTER help From c6174c0e058fc0a54e0b9787c44cb24b0a8d0217 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 28 Dec 2020 14:03:02 +0200 Subject: [PATCH 170/547] interconnect: imx: Add a missing of_node_put after of_device_is_available Add an 'of_node_put()' call when a tested device node is not available. Fixes: f0d8048525d7 ("interconnect: Add imx core driver") Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/20201206121304.29381-1-christophe.jaillet@wanadoo.fr Signed-off-by: Georgi Djakov --- drivers/interconnect/imx/imx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/interconnect/imx/imx.c b/drivers/interconnect/imx/imx.c index 41dba7090c2a..e398ebf1dbba 100644 --- a/drivers/interconnect/imx/imx.c +++ b/drivers/interconnect/imx/imx.c @@ -99,6 +99,7 @@ static int imx_icc_node_init_qos(struct icc_provider *provider, if (!dn || !of_device_is_available(dn)) { dev_warn(dev, "Missing property %s, skip scaling %s\n", adj->phandle_name, node->name); + of_node_put(dn); return 0; } From 6414b79d02c426b7dd7d942fc19fb38220ea44ec Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 28 Dec 2020 14:03:02 +0200 Subject: [PATCH 171/547] interconnect: imx: Remove a useless test 'dn' can't be NULL here, it is tested just the line above. Remove this useless test. Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/20201206121322.29434-1-christophe.jaillet@wanadoo.fr Signed-off-by: Georgi Djakov --- drivers/interconnect/imx/imx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/interconnect/imx/imx.c b/drivers/interconnect/imx/imx.c index e398ebf1dbba..c770951a909c 100644 --- a/drivers/interconnect/imx/imx.c +++ b/drivers/interconnect/imx/imx.c @@ -96,7 +96,7 @@ static int imx_icc_node_init_qos(struct icc_provider *provider, return -ENODEV; } /* Allow scaling to be disabled on a per-node basis */ - if (!dn || !of_device_is_available(dn)) { + if (!of_device_is_available(dn)) { dev_warn(dev, "Missing property %s, skip scaling %s\n", adj->phandle_name, node->name); of_node_put(dn); From 67288f74d4837b82ef937170da3389b0779c17be Mon Sep 17 00:00:00 2001 From: Martin Kepplinger Date: Mon, 28 Dec 2020 14:03:02 +0200 Subject: [PATCH 172/547] interconnect: imx8mq: Use icc_sync_state Add the icc_sync_state callback to notify the framework when consumers are probed and the bandwidth doesn't have to be kept at maximum anymore. Signed-off-by: Martin Kepplinger Suggested-by: Georgi Djakov Fixes: 7d3b0b0d8184 ("interconnect: qcom: Use icc_sync_state") Link: https://lore.kernel.org/r/20201210100906.18205-6-martin.kepplinger@puri.sm Signed-off-by: Georgi Djakov --- drivers/interconnect/imx/imx8mq.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/interconnect/imx/imx8mq.c b/drivers/interconnect/imx/imx8mq.c index ba43a15aefec..d7768d3c6d8a 100644 --- a/drivers/interconnect/imx/imx8mq.c +++ b/drivers/interconnect/imx/imx8mq.c @@ -7,6 +7,7 @@ #include #include +#include #include #include "imx.h" @@ -94,6 +95,7 @@ static struct platform_driver imx8mq_icc_driver = { .remove = imx8mq_icc_remove, .driver = { .name = "imx8mq-interconnect", + .sync_state = icc_sync_state, }, }; From 12b38ea040b3bb2a30eb9cd488376df5be7ea81f Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 13 Dec 2020 16:11:05 +0100 Subject: [PATCH 173/547] staging: spmi: hisi-spmi-controller: Fix some error handling paths IN the probe function, if an error occurs after calling 'spmi_controller_alloc()', it must be undone by a corresponding 'spmi_controller_put() call. In the remove function, use 'spmi_controller_put(ctrl)' instead of 'kfree(ctrl)'. While a it fix an error message (s/spmi_add_controller/spmi_controller_add/) Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/20201213151105.137731-1-christophe.jaillet@wanadoo.fr Signed-off-by: Greg Kroah-Hartman --- .../staging/hikey9xx/hisi-spmi-controller.c | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/drivers/staging/hikey9xx/hisi-spmi-controller.c b/drivers/staging/hikey9xx/hisi-spmi-controller.c index 861aedd5de48..0d42bc65f39b 100644 --- a/drivers/staging/hikey9xx/hisi-spmi-controller.c +++ b/drivers/staging/hikey9xx/hisi-spmi-controller.c @@ -278,21 +278,24 @@ static int spmi_controller_probe(struct platform_device *pdev) iores = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!iores) { dev_err(&pdev->dev, "can not get resource!\n"); - return -EINVAL; + ret = -EINVAL; + goto err_put_controller; } spmi_controller->base = devm_ioremap(&pdev->dev, iores->start, resource_size(iores)); if (!spmi_controller->base) { dev_err(&pdev->dev, "can not remap base addr!\n"); - return -EADDRNOTAVAIL; + ret = -EADDRNOTAVAIL; + goto err_put_controller; } ret = of_property_read_u32(pdev->dev.of_node, "spmi-channel", &spmi_controller->channel); if (ret) { dev_err(&pdev->dev, "can not get channel\n"); - return -ENODEV; + ret = -ENODEV; + goto err_put_controller; } platform_set_drvdata(pdev, spmi_controller); @@ -309,9 +312,15 @@ static int spmi_controller_probe(struct platform_device *pdev) ctrl->write_cmd = spmi_write_cmd; ret = spmi_controller_add(ctrl); - if (ret) - dev_err(&pdev->dev, "spmi_add_controller failed with error %d!\n", ret); + if (ret) { + dev_err(&pdev->dev, "spmi_controller_add failed with error %d!\n", ret); + goto err_put_controller; + } + return 0; + +err_put_controller: + spmi_controller_put(ctrl); return ret; } @@ -320,7 +329,7 @@ static int spmi_del_controller(struct platform_device *pdev) struct spmi_controller *ctrl = platform_get_drvdata(pdev); spmi_controller_remove(ctrl); - kfree(ctrl); + spmi_controller_put(ctrl); return 0; } From cab36da4bf1a35739b091b73714a39a1bbd02b05 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 2 Dec 2020 09:43:49 +0300 Subject: [PATCH 174/547] Staging: comedi: Return -EFAULT if copy_to_user() fails Return -EFAULT on error instead of the number of bytes remaining to be copied. Fixes: bac42fb21259 ("comedi: get rid of compat_alloc_user_space() mess in COMEDI_CMD{,TEST} compat") Signed-off-by: Dan Carpenter Cc: stable Link: https://lore.kernel.org/r/X8c3pfwFy2jpy4BP@mwanda Signed-off-by: Greg Kroah-Hartman --- drivers/staging/comedi/comedi_fops.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/staging/comedi/comedi_fops.c b/drivers/staging/comedi/comedi_fops.c index d99231c737fb..80d74cce2a01 100644 --- a/drivers/staging/comedi/comedi_fops.c +++ b/drivers/staging/comedi/comedi_fops.c @@ -2987,7 +2987,9 @@ static int put_compat_cmd(struct comedi32_cmd_struct __user *cmd32, v32.chanlist_len = cmd->chanlist_len; v32.data = ptr_to_compat(cmd->data); v32.data_len = cmd->data_len; - return copy_to_user(cmd32, &v32, sizeof(v32)); + if (copy_to_user(cmd32, &v32, sizeof(v32))) + return -EFAULT; + return 0; } /* Handle 32-bit COMEDI_CMD ioctl. */ From d887d6104adeb94d1b926936ea21f07367f0ff9f Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 13 Dec 2020 16:35:13 +0100 Subject: [PATCH 175/547] staging: mt7621-dma: Fix a resource leak in an error handling path If an error occurs after calling 'mtk_hsdma_init()', it must be undone by a corresponding call to 'mtk_hsdma_uninit()' as already done in the remove function. Fixes: 0853c7a53eb3 ("staging: mt7621-dma: ralink: add rt2880 dma engine") Signed-off-by: Christophe JAILLET Cc: stable Link: https://lore.kernel.org/r/20201213153513.138723-1-christophe.jaillet@wanadoo.fr Signed-off-by: Greg Kroah-Hartman --- drivers/staging/mt7621-dma/mtk-hsdma.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/staging/mt7621-dma/mtk-hsdma.c b/drivers/staging/mt7621-dma/mtk-hsdma.c index d241349214e7..bc4bb4374313 100644 --- a/drivers/staging/mt7621-dma/mtk-hsdma.c +++ b/drivers/staging/mt7621-dma/mtk-hsdma.c @@ -712,7 +712,7 @@ static int mtk_hsdma_probe(struct platform_device *pdev) ret = dma_async_device_register(dd); if (ret) { dev_err(&pdev->dev, "failed to register dma device\n"); - return ret; + goto err_uninit_hsdma; } ret = of_dma_controller_register(pdev->dev.of_node, @@ -728,6 +728,8 @@ static int mtk_hsdma_probe(struct platform_device *pdev) err_unregister: dma_async_device_unregister(dd); +err_uninit_hsdma: + mtk_hsdma_uninit(hsdma); return ret; } From 0ffc76539e6e8d28114f95ac25c167c37b5191b3 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sun, 27 Dec 2020 13:45:02 +0000 Subject: [PATCH 176/547] USB: cdc-acm: blacklist another IR Droid device This device is supported by the IR Toy driver. Reported-by: Georgi Bakalski Signed-off-by: Sean Young Acked-by: Oliver Neukum Cc: stable Link: https://lore.kernel.org/r/20201227134502.4548-2-sean@mess.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/cdc-acm.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index f52f1bc0559f..781905745812 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -1895,6 +1895,10 @@ static const struct usb_device_id acm_ids[] = { { USB_DEVICE(0x04d8, 0xfd08), .driver_info = IGNORE_DEVICE, }, + + { USB_DEVICE(0x04d8, 0xf58b), + .driver_info = IGNORE_DEVICE, + }, #endif /*Samsung phone in firmware update mode */ From 421da9413a6a5ec4334cade5092370cf2c8c8add Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Fri, 18 Dec 2020 12:57:36 +0200 Subject: [PATCH 177/547] MAINTAINERS: Update address for Cadence USB3 driver Updates my email address for Cadence USB3 driver. Signed-off-by: Roger Quadros Link: https://lore.kernel.org/r/20201218105736.17667-1-rogerq@ti.com Signed-off-by: Greg Kroah-Hartman --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 546aa66428c9..afe3a5c66bc9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3883,7 +3883,7 @@ F: drivers/mtd/nand/raw/cadence-nand-controller.c CADENCE USB3 DRD IP DRIVER M: Peter Chen M: Pawel Laszczak -M: Roger Quadros +R: Roger Quadros R: Aswath Govindraju L: linux-usb@vger.kernel.org S: Maintained From 88ebce92806e5dff3549e1a8cacb53978104d3b4 Mon Sep 17 00:00:00 2001 From: Aswath Govindraju Date: Tue, 15 Dec 2020 09:55:49 +0530 Subject: [PATCH 178/547] dt-bindings: usb: Add new compatible string for AM64 SoC Add compatible string in j721e-usb binding file as the same USB subsystem is present in AM64. Reviewed-by: Rob Herring Signed-off-by: Aswath Govindraju Link: https://lore.kernel.org/r/20201215042549.7956-1-a-govindraju@ti.com Signed-off-by: Greg Kroah-Hartman --- Documentation/devicetree/bindings/usb/ti,j721e-usb.yaml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/usb/ti,j721e-usb.yaml b/Documentation/devicetree/bindings/usb/ti,j721e-usb.yaml index 388245b91a55..148b3fb4ceaf 100644 --- a/Documentation/devicetree/bindings/usb/ti,j721e-usb.yaml +++ b/Documentation/devicetree/bindings/usb/ti,j721e-usb.yaml @@ -11,8 +11,12 @@ maintainers: properties: compatible: - items: + oneOf: - const: ti,j721e-usb + - const: ti,am64-usb + - items: + - const: ti,j721e-usb + - const: ti,am64-usb reg: description: module registers From a390bef7db1f192cc5b588dbcf8ed113406ec130 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Thu, 10 Dec 2020 18:04:13 -0300 Subject: [PATCH 179/547] usb: gadget: fsl_mxc_udc: Remove the driver Since 5.10-rc1 i.MX is a devicetree-only platform, and this driver was only used by the old non-DT i.MX devices. Remove the driver as it has no users left. Signed-off-by: Fabio Estevam Acked-by: Peter Chen Link: https://lore.kernel.org/r/20201210210413.15262-1-festevam@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/Kconfig | 2 +- drivers/usb/gadget/udc/Makefile | 1 - drivers/usb/gadget/udc/fsl_mxc_udc.c | 122 --------------------------- 3 files changed, 1 insertion(+), 124 deletions(-) delete mode 100644 drivers/usb/gadget/udc/fsl_mxc_udc.c diff --git a/drivers/usb/gadget/udc/Kconfig b/drivers/usb/gadget/udc/Kconfig index 1a12aab208b4..8c614bb86c66 100644 --- a/drivers/usb/gadget/udc/Kconfig +++ b/drivers/usb/gadget/udc/Kconfig @@ -90,7 +90,7 @@ config USB_BCM63XX_UDC config USB_FSL_USB2 tristate "Freescale Highspeed USB DR Peripheral Controller" - depends on FSL_SOC || ARCH_MXC + depends on FSL_SOC help Some of Freescale PowerPC and i.MX processors have a High Speed Dual-Role(DR) USB controller, which supports device mode. diff --git a/drivers/usb/gadget/udc/Makefile b/drivers/usb/gadget/udc/Makefile index f5a7ce28aecd..a21f2224e7eb 100644 --- a/drivers/usb/gadget/udc/Makefile +++ b/drivers/usb/gadget/udc/Makefile @@ -23,7 +23,6 @@ obj-$(CONFIG_USB_ATMEL_USBA) += atmel_usba_udc.o obj-$(CONFIG_USB_BCM63XX_UDC) += bcm63xx_udc.o obj-$(CONFIG_USB_FSL_USB2) += fsl_usb2_udc.o fsl_usb2_udc-y := fsl_udc_core.o -fsl_usb2_udc-$(CONFIG_ARCH_MXC) += fsl_mxc_udc.o obj-$(CONFIG_USB_TEGRA_XUDC) += tegra-xudc.o obj-$(CONFIG_USB_M66592) += m66592-udc.o obj-$(CONFIG_USB_R8A66597) += r8a66597-udc.o diff --git a/drivers/usb/gadget/udc/fsl_mxc_udc.c b/drivers/usb/gadget/udc/fsl_mxc_udc.c deleted file mode 100644 index 5a321992decc..000000000000 --- a/drivers/usb/gadget/udc/fsl_mxc_udc.c +++ /dev/null @@ -1,122 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Copyright (C) 2009 - * Guennadi Liakhovetski, DENX Software Engineering, - * - * Description: - * Helper routines for i.MX3x SoCs from Freescale, needed by the fsl_usb2_udc.c - * driver to function correctly on these systems. - */ -#include -#include -#include -#include -#include -#include -#include - -#include "fsl_usb2_udc.h" - -static struct clk *mxc_ahb_clk; -static struct clk *mxc_per_clk; -static struct clk *mxc_ipg_clk; - -/* workaround ENGcm09152 for i.MX35 */ -#define MX35_USBPHYCTRL_OFFSET 0x600 -#define USBPHYCTRL_OTGBASE_OFFSET 0x8 -#define USBPHYCTRL_EVDO (1 << 23) - -int fsl_udc_clk_init(struct platform_device *pdev) -{ - struct fsl_usb2_platform_data *pdata; - unsigned long freq; - int ret; - - pdata = dev_get_platdata(&pdev->dev); - - mxc_ipg_clk = devm_clk_get(&pdev->dev, "ipg"); - if (IS_ERR(mxc_ipg_clk)) { - dev_err(&pdev->dev, "clk_get(\"ipg\") failed\n"); - return PTR_ERR(mxc_ipg_clk); - } - - mxc_ahb_clk = devm_clk_get(&pdev->dev, "ahb"); - if (IS_ERR(mxc_ahb_clk)) { - dev_err(&pdev->dev, "clk_get(\"ahb\") failed\n"); - return PTR_ERR(mxc_ahb_clk); - } - - mxc_per_clk = devm_clk_get(&pdev->dev, "per"); - if (IS_ERR(mxc_per_clk)) { - dev_err(&pdev->dev, "clk_get(\"per\") failed\n"); - return PTR_ERR(mxc_per_clk); - } - - clk_prepare_enable(mxc_ipg_clk); - clk_prepare_enable(mxc_ahb_clk); - clk_prepare_enable(mxc_per_clk); - - /* make sure USB_CLK is running at 60 MHz +/- 1000 Hz */ - if (!strcmp(pdev->id_entry->name, "imx-udc-mx27")) { - freq = clk_get_rate(mxc_per_clk); - if (pdata->phy_mode != FSL_USB2_PHY_ULPI && - (freq < 59999000 || freq > 60001000)) { - dev_err(&pdev->dev, "USB_CLK=%lu, should be 60MHz\n", freq); - ret = -EINVAL; - goto eclkrate; - } - } - - return 0; - -eclkrate: - clk_disable_unprepare(mxc_ipg_clk); - clk_disable_unprepare(mxc_ahb_clk); - clk_disable_unprepare(mxc_per_clk); - mxc_per_clk = NULL; - return ret; -} - -int fsl_udc_clk_finalize(struct platform_device *pdev) -{ - struct fsl_usb2_platform_data *pdata = dev_get_platdata(&pdev->dev); - int ret = 0; - - /* workaround ENGcm09152 for i.MX35 */ - if (pdata->workaround & FLS_USB2_WORKAROUND_ENGCM09152) { - unsigned int v; - struct resource *res = platform_get_resource - (pdev, IORESOURCE_MEM, 0); - void __iomem *phy_regs = ioremap(res->start + - MX35_USBPHYCTRL_OFFSET, 512); - if (!phy_regs) { - dev_err(&pdev->dev, "ioremap for phy address fails\n"); - ret = -EINVAL; - goto ioremap_err; - } - - v = readl(phy_regs + USBPHYCTRL_OTGBASE_OFFSET); - writel(v | USBPHYCTRL_EVDO, - phy_regs + USBPHYCTRL_OTGBASE_OFFSET); - - iounmap(phy_regs); - } - - -ioremap_err: - /* ULPI transceivers don't need usbpll */ - if (pdata->phy_mode == FSL_USB2_PHY_ULPI) { - clk_disable_unprepare(mxc_per_clk); - mxc_per_clk = NULL; - } - - return ret; -} - -void fsl_udc_clk_release(void) -{ - if (mxc_per_clk) - clk_disable_unprepare(mxc_per_clk); - clk_disable_unprepare(mxc_ahb_clk); - clk_disable_unprepare(mxc_ipg_clk); -} From 5e5ff0b4b6bcb4d17b7a26ec8bcfc7dd4651684f Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sun, 20 Dec 2020 00:25:53 +0900 Subject: [PATCH 180/547] USB: cdc-wdm: Fix use after free in service_outstanding_interrupt(). syzbot is reporting UAF at usb_submit_urb() [1], for service_outstanding_interrupt() is not checking WDM_DISCONNECTING before calling usb_submit_urb(). Close the race by doing same checks wdm_read() does upon retry. Also, while wdm_read() checks WDM_DISCONNECTING with desc->rlock held, service_interrupt_work() does not hold desc->rlock. Thus, it is possible that usb_submit_urb() is called from service_outstanding_interrupt() from service_interrupt_work() after WDM_DISCONNECTING was set and kill_urbs() from wdm_disconnect() completed. Thus, move kill_urbs() in wdm_disconnect() to after cancel_work_sync() (which makes sure that service_interrupt_work() is no longer running) completed. Although it seems to be safe to dereference desc->intf->dev in service_outstanding_interrupt() even if WDM_DISCONNECTING was already set because desc->rlock or cancel_work_sync() prevents wdm_disconnect() from reaching list_del() before service_outstanding_interrupt() completes, let's not emit error message if WDM_DISCONNECTING is set by wdm_disconnect() while usb_submit_urb() is in progress. [1] https://syzkaller.appspot.com/bug?extid=9e04e2df4a32fb661daf Reported-by: syzbot Signed-off-by: Tetsuo Handa Cc: stable Link: https://lore.kernel.org/r/620e2ee0-b9a3-dbda-a25b-a93e0ed03ec5@i-love.sakura.ne.jp Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/cdc-wdm.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c index 02d0cfd23bb2..508b1c3f8b73 100644 --- a/drivers/usb/class/cdc-wdm.c +++ b/drivers/usb/class/cdc-wdm.c @@ -465,13 +465,23 @@ static int service_outstanding_interrupt(struct wdm_device *desc) if (!desc->resp_count || !--desc->resp_count) goto out; + if (test_bit(WDM_DISCONNECTING, &desc->flags)) { + rv = -ENODEV; + goto out; + } + if (test_bit(WDM_RESETTING, &desc->flags)) { + rv = -EIO; + goto out; + } + set_bit(WDM_RESPONDING, &desc->flags); spin_unlock_irq(&desc->iuspin); rv = usb_submit_urb(desc->response, GFP_KERNEL); spin_lock_irq(&desc->iuspin); if (rv) { - dev_err(&desc->intf->dev, - "usb_submit_urb failed with result %d\n", rv); + if (!test_bit(WDM_DISCONNECTING, &desc->flags)) + dev_err(&desc->intf->dev, + "usb_submit_urb failed with result %d\n", rv); /* make sure the next notification trigger a submit */ clear_bit(WDM_RESPONDING, &desc->flags); @@ -1027,9 +1037,9 @@ static void wdm_disconnect(struct usb_interface *intf) wake_up_all(&desc->wait); mutex_lock(&desc->rlock); mutex_lock(&desc->wlock); - kill_urbs(desc); cancel_work_sync(&desc->rxwork); cancel_work_sync(&desc->service_outs_intr); + kill_urbs(desc); mutex_unlock(&desc->wlock); mutex_unlock(&desc->rlock); From 0f041b8592daaaea46e91a8ebb3b47e6e0171fd8 Mon Sep 17 00:00:00 2001 From: Madhusudanarao Amara Date: Wed, 16 Dec 2020 19:39:18 +0530 Subject: [PATCH 181/547] usb: typec: intel_pmc_mux: Configure HPD first for HPD+IRQ request Warm reboot scenarios some times type C Mux driver gets Mux configuration request as HPD=1,IRQ=1. In that scenario typeC Mux driver need to configure Mux as follows as per IOM requirement: (1). Confgiure Mux HPD = 1, IRQ = 0 (2). Configure Mux with HPD = 1, IRQ = 1 IOM expects TypeC Mux configuration as follows: (1). HPD=1, IRQ=0 (2). HPD=1, IRQ=1 if IOM gets mux config request (2) without configuring (1), it will ignore the request. The impact of this is there is no DP_alt mode display. Fixes: 43d596e32276 ("usb: typec: intel_pmc_mux: Check the port status before connect") Cc: stable@vger.kernel.org Reviewed-by: Heikki Krogerus Signed-off-by: Madhusudanarao Amara Link: https://lore.kernel.org/r/20201216140918.49197-1-madhusudanarao.amara@intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/mux/intel_pmc_mux.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/usb/typec/mux/intel_pmc_mux.c b/drivers/usb/typec/mux/intel_pmc_mux.c index cf37a59ce130..46a25b8db72e 100644 --- a/drivers/usb/typec/mux/intel_pmc_mux.c +++ b/drivers/usb/typec/mux/intel_pmc_mux.c @@ -207,10 +207,21 @@ static int pmc_usb_mux_dp_hpd(struct pmc_usb_port *port, struct typec_displayport_data *dp) { u8 msg[2] = { }; + int ret; msg[0] = PMC_USB_DP_HPD; msg[0] |= port->usb3_port << PMC_USB_MSG_USB3_PORT_SHIFT; + /* Configure HPD first if HPD,IRQ comes together */ + if (!IOM_PORT_HPD_ASSERTED(port->iom_status) && + dp->status & DP_STATUS_IRQ_HPD && + dp->status & DP_STATUS_HPD_STATE) { + msg[1] = PMC_USB_DP_HPD_LVL; + ret = pmc_usb_command(port, msg, sizeof(msg)); + if (ret) + return ret; + } + if (dp->status & DP_STATUS_IRQ_HPD) msg[1] = PMC_USB_DP_HPD_IRQ; From 5d5323a6f3625f101dbfa94ba3ef7706cce38760 Mon Sep 17 00:00:00 2001 From: Michael Grzeschik Date: Tue, 15 Dec 2020 20:31:47 +0100 Subject: [PATCH 182/547] USB: xhci: fix U1/U2 handling for hardware with XHCI_INTEL_HOST quirk set The commit 0472bf06c6fd ("xhci: Prevent U1/U2 link pm states if exit latency is too long") was constraining the xhci code not to allow U1/U2 sleep states if the latency to wake up from the U-states reached the service interval of an periodic endpoint. This fix was not taking into account that in case the quirk XHCI_INTEL_HOST is set, the wakeup time will be calculated and configured differently. It checks for u1_params.mel/u2_params.mel as a limit. But the code could decide to write another MEL into the hardware. This leads to broken cases where not enough bandwidth is available for other devices: usb 1-2: can't set config #1, error -28 This patch is fixing that case by checking for timeout_ns after the wakeup time was calculated depending on the quirks. Fixes: 0472bf06c6fd ("xhci: Prevent U1/U2 link pm states if exit latency is too long") Signed-off-by: Michael Grzeschik Cc: stable Link: https://lore.kernel.org/r/20201215193147.11738-1-m.grzeschik@pengutronix.de Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index 91ab81c3fc79..e86940571b4c 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -4770,19 +4770,19 @@ static u16 xhci_calculate_u1_timeout(struct xhci_hcd *xhci, { unsigned long long timeout_ns; - /* Prevent U1 if service interval is shorter than U1 exit latency */ - if (usb_endpoint_xfer_int(desc) || usb_endpoint_xfer_isoc(desc)) { - if (xhci_service_interval_to_ns(desc) <= udev->u1_params.mel) { - dev_dbg(&udev->dev, "Disable U1, ESIT shorter than exit latency\n"); - return USB3_LPM_DISABLED; - } - } - if (xhci->quirks & XHCI_INTEL_HOST) timeout_ns = xhci_calculate_intel_u1_timeout(udev, desc); else timeout_ns = udev->u1_params.sel; + /* Prevent U1 if service interval is shorter than U1 exit latency */ + if (usb_endpoint_xfer_int(desc) || usb_endpoint_xfer_isoc(desc)) { + if (xhci_service_interval_to_ns(desc) <= timeout_ns) { + dev_dbg(&udev->dev, "Disable U1, ESIT shorter than exit latency\n"); + return USB3_LPM_DISABLED; + } + } + /* The U1 timeout is encoded in 1us intervals. * Don't return a timeout of zero, because that's USB3_LPM_DISABLED. */ @@ -4834,19 +4834,19 @@ static u16 xhci_calculate_u2_timeout(struct xhci_hcd *xhci, { unsigned long long timeout_ns; - /* Prevent U2 if service interval is shorter than U2 exit latency */ - if (usb_endpoint_xfer_int(desc) || usb_endpoint_xfer_isoc(desc)) { - if (xhci_service_interval_to_ns(desc) <= udev->u2_params.mel) { - dev_dbg(&udev->dev, "Disable U2, ESIT shorter than exit latency\n"); - return USB3_LPM_DISABLED; - } - } - if (xhci->quirks & XHCI_INTEL_HOST) timeout_ns = xhci_calculate_intel_u2_timeout(udev, desc); else timeout_ns = udev->u2_params.sel; + /* Prevent U2 if service interval is shorter than U2 exit latency */ + if (usb_endpoint_xfer_int(desc) || usb_endpoint_xfer_isoc(desc)) { + if (xhci_service_interval_to_ns(desc) <= timeout_ns) { + dev_dbg(&udev->dev, "Disable U2, ESIT shorter than exit latency\n"); + return USB3_LPM_DISABLED; + } + } + /* The U2 timeout is encoded in 256us intervals */ timeout_ns = DIV_ROUND_UP_ULL(timeout_ns, 256 * 1000); /* If the necessary timeout value is bigger than what we can set in the From a5ada3dfe6a20f41f91448b9034a1ef8da3dc87d Mon Sep 17 00:00:00 2001 From: Zheng Zengkai Date: Tue, 15 Dec 2020 10:54:59 +0800 Subject: [PATCH 183/547] usb: dwc3: meson-g12a: disable clk on error handling path in probe dwc3_meson_g12a_probe() does not invoke clk_bulk_disable_unprepare() on one error handling path. This patch fixes that. Fixes: 347052e3bf1b ("usb: dwc3: meson-g12a: fix USB2 PHY initialization on G12A and A1 SoCs") Reported-by: Hulk Robot Signed-off-by: Zheng Zengkai Cc: stable Reviewed-by: Martin Blumenstingl Link: https://lore.kernel.org/r/20201215025459.91794-1-zhengzengkai@huawei.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/dwc3-meson-g12a.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/dwc3/dwc3-meson-g12a.c b/drivers/usb/dwc3/dwc3-meson-g12a.c index 417e05381b5d..bdf1f98dfad8 100644 --- a/drivers/usb/dwc3/dwc3-meson-g12a.c +++ b/drivers/usb/dwc3/dwc3-meson-g12a.c @@ -754,7 +754,7 @@ static int dwc3_meson_g12a_probe(struct platform_device *pdev) ret = priv->drvdata->setup_regmaps(priv, base); if (ret) - return ret; + goto err_disable_clks; if (priv->vbus) { ret = regulator_enable(priv->vbus); From 2cc332e4ee4febcbb685e2962ad323fe4b3b750a Mon Sep 17 00:00:00 2001 From: Zqiang Date: Thu, 10 Dec 2020 10:01:48 +0800 Subject: [PATCH 184/547] usb: gadget: function: printer: Fix a memory leak for interface descriptor When printer driver is loaded, the printer_func_bind function is called, in this function, the interface descriptor be allocated memory, if after that, the error occurred, the interface descriptor memory need to be free. Reviewed-by: Peter Chen Cc: Signed-off-by: Zqiang Link: https://lore.kernel.org/r/20201210020148.6691-1-qiang.zhang@windriver.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_printer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/gadget/function/f_printer.c b/drivers/usb/gadget/function/f_printer.c index 64a4112068fc..2f1eb2e81d30 100644 --- a/drivers/usb/gadget/function/f_printer.c +++ b/drivers/usb/gadget/function/f_printer.c @@ -1162,6 +1162,7 @@ fail_tx_reqs: printer_req_free(dev->in_ep, req); } + usb_free_all_descriptors(f); return ret; } From 5cc35c224a80aa5a5a539510ef049faf0d6ed181 Mon Sep 17 00:00:00 2001 From: Sriharsha Allenki Date: Wed, 2 Dec 2020 18:32:20 +0530 Subject: [PATCH 185/547] usb: gadget: Fix spinlock lockup on usb_function_deactivate There is a spinlock lockup as part of composite_disconnect when it tries to acquire cdev->lock as part of usb_gadget_deactivate. This is because the usb_gadget_deactivate is called from usb_function_deactivate with the same spinlock held. This would result in the below call stack and leads to stall. rcu: INFO: rcu_preempt detected stalls on CPUs/tasks: rcu: 3-...0: (1 GPs behind) idle=162/1/0x4000000000000000 softirq=10819/10819 fqs=2356 (detected by 2, t=5252 jiffies, g=20129, q=3770) Task dump for CPU 3: task:uvc-gadget_wlhe state:R running task stack: 0 pid: 674 ppid: 636 flags:0x00000202 Call trace: __switch_to+0xc0/0x170 _raw_spin_lock_irqsave+0x84/0xb0 composite_disconnect+0x28/0x78 configfs_composite_disconnect+0x68/0x70 usb_gadget_disconnect+0x10c/0x128 usb_gadget_deactivate+0xd4/0x108 usb_function_deactivate+0x6c/0x80 uvc_function_disconnect+0x20/0x58 uvc_v4l2_release+0x30/0x88 v4l2_release+0xbc/0xf0 __fput+0x7c/0x230 ____fput+0x14/0x20 task_work_run+0x88/0x140 do_notify_resume+0x240/0x6f0 work_pending+0x8/0x200 Fix this by doing an unlock on cdev->lock before the usb_gadget_deactivate call from usb_function_deactivate. The same lockup can happen in the usb_gadget_activate path. Fix that path as well. Reported-by: Peter Chen Link: https://lore.kernel.org/linux-usb/20201102094936.GA29581@b29397-desktop/ Tested-by: Peter Chen Signed-off-by: Sriharsha Allenki Cc: stable Link: https://lore.kernel.org/r/20201202130220.24926-1-sallenki@codeaurora.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/composite.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c index c6d455f2bb92..1a556a628971 100644 --- a/drivers/usb/gadget/composite.c +++ b/drivers/usb/gadget/composite.c @@ -392,8 +392,11 @@ int usb_function_deactivate(struct usb_function *function) spin_lock_irqsave(&cdev->lock, flags); - if (cdev->deactivations == 0) + if (cdev->deactivations == 0) { + spin_unlock_irqrestore(&cdev->lock, flags); status = usb_gadget_deactivate(cdev->gadget); + spin_lock_irqsave(&cdev->lock, flags); + } if (status == 0) cdev->deactivations++; @@ -424,8 +427,11 @@ int usb_function_activate(struct usb_function *function) status = -EINVAL; else { cdev->deactivations--; - if (cdev->deactivations == 0) + if (cdev->deactivations == 0) { + spin_unlock_irqrestore(&cdev->lock, flags); status = usb_gadget_activate(cdev->gadget); + spin_lock_irqsave(&cdev->lock, flags); + } } spin_unlock_irqrestore(&cdev->lock, flags); From c91d3a6bcaa031f551ba29a496a8027b31289464 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 17 Nov 2020 17:29:55 +0800 Subject: [PATCH 186/547] USB: gadget: legacy: fix return error code in acm_ms_bind() If usb_otg_descriptor_alloc() failed, it need return ENOMEM. Fixes: 578aa8a2b12c ("usb: gadget: acm_ms: allocate and init otg descriptor by otg capabilities") Reported-by: Hulk Robot Signed-off-by: Yang Yingliang Cc: stable Link: https://lore.kernel.org/r/20201117092955.4102785-1-yangyingliang@huawei.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/legacy/acm_ms.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/legacy/acm_ms.c b/drivers/usb/gadget/legacy/acm_ms.c index 59be2d8417c9..e8033e5f0c18 100644 --- a/drivers/usb/gadget/legacy/acm_ms.c +++ b/drivers/usb/gadget/legacy/acm_ms.c @@ -200,8 +200,10 @@ static int acm_ms_bind(struct usb_composite_dev *cdev) struct usb_descriptor_header *usb_desc; usb_desc = usb_otg_descriptor_alloc(gadget); - if (!usb_desc) + if (!usb_desc) { + status = -ENOMEM; goto fail_string_ids; + } usb_otg_descriptor_init(gadget, usb_desc); otg_desc[0] = usb_desc; otg_desc[1] = NULL; From 0a88fa221ce911c331bf700d2214c5b2f77414d3 Mon Sep 17 00:00:00 2001 From: Manish Narani Date: Tue, 17 Nov 2020 12:43:35 +0530 Subject: [PATCH 187/547] usb: gadget: u_ether: Fix MTU size mismatch with RX packet size Fix the MTU size issue with RX packet size as the host sends the packet with extra bytes containing ethernet header. This causes failure when user sets the MTU size to the maximum i.e. 15412. In this case the ethernet packet received will be of length 15412 plus the ethernet header length. This patch fixes the issue where there is a check that RX packet length must not be more than max packet length. Fixes: bba787a860fa ("usb: gadget: ether: Allow jumbo frames") Signed-off-by: Manish Narani Cc: stable Link: https://lore.kernel.org/r/1605597215-122027-1-git-send-email-manish.narani@xilinx.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/u_ether.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/usb/gadget/function/u_ether.c b/drivers/usb/gadget/function/u_ether.c index 31ea76adcc0d..c019f2b0c0af 100644 --- a/drivers/usb/gadget/function/u_ether.c +++ b/drivers/usb/gadget/function/u_ether.c @@ -45,9 +45,10 @@ #define UETH__VERSION "29-May-2008" /* Experiments show that both Linux and Windows hosts allow up to 16k - * frame sizes. Set the max size to 15k+52 to prevent allocating 32k + * frame sizes. Set the max MTU size to 15k+52 to prevent allocating 32k * blocks and still have efficient handling. */ -#define GETHER_MAX_ETH_FRAME_LEN 15412 +#define GETHER_MAX_MTU_SIZE 15412 +#define GETHER_MAX_ETH_FRAME_LEN (GETHER_MAX_MTU_SIZE + ETH_HLEN) struct eth_dev { /* lock is held while accessing port_usb @@ -786,7 +787,7 @@ struct eth_dev *gether_setup_name(struct usb_gadget *g, /* MTU range: 14 - 15412 */ net->min_mtu = ETH_HLEN; - net->max_mtu = GETHER_MAX_ETH_FRAME_LEN; + net->max_mtu = GETHER_MAX_MTU_SIZE; dev->gadget = g; SET_NETDEV_DEV(net, &g->dev); @@ -848,7 +849,7 @@ struct net_device *gether_setup_name_default(const char *netname) /* MTU range: 14 - 15412 */ net->min_mtu = ETH_HLEN; - net->max_mtu = GETHER_MAX_ETH_FRAME_LEN; + net->max_mtu = GETHER_MAX_MTU_SIZE; return net; } From 83a43ff80a566de8718dfc6565545a0080ec1fb5 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 17 Nov 2020 09:14:30 +0800 Subject: [PATCH 188/547] usb: chipidea: ci_hdrc_imx: add missing put_device() call in usbmisc_get_init_data() if of_find_device_by_node() succeed, usbmisc_get_init_data() doesn't have a corresponding put_device(). Thus add put_device() to fix the exception handling for this function implementation. Fixes: ef12da914ed6 ("usb: chipidea: imx: properly check for usbmisc") Signed-off-by: Yu Kuai Cc: stable Link: https://lore.kernel.org/r/20201117011430.642589-1-yukuai3@huawei.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/chipidea/ci_hdrc_imx.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/usb/chipidea/ci_hdrc_imx.c b/drivers/usb/chipidea/ci_hdrc_imx.c index 9e12152ea46b..8b7bc10b6e8b 100644 --- a/drivers/usb/chipidea/ci_hdrc_imx.c +++ b/drivers/usb/chipidea/ci_hdrc_imx.c @@ -139,9 +139,13 @@ static struct imx_usbmisc_data *usbmisc_get_init_data(struct device *dev) misc_pdev = of_find_device_by_node(args.np); of_node_put(args.np); - if (!misc_pdev || !platform_get_drvdata(misc_pdev)) + if (!misc_pdev) return ERR_PTR(-EPROBE_DEFER); + if (!platform_get_drvdata(misc_pdev)) { + put_device(&misc_pdev->dev); + return ERR_PTR(-EPROBE_DEFER); + } data->dev = &misc_pdev->dev; /* From 372c93131998c0622304bed118322d2a04489e63 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 14 Dec 2020 11:30:53 +0100 Subject: [PATCH 189/547] USB: yurex: fix control-URB timeout handling Make sure to always cancel the control URB in write() so that it can be reused after a timeout or spurious CMD_ACK. Currently any further write requests after a timeout would fail after triggering a WARN() in usb_submit_urb() when attempting to submit the already active URB. Reported-by: syzbot+e87ebe0f7913f71f2ea5@syzkaller.appspotmail.com Fixes: 6bc235a2e24a ("USB: add driver for Meywa-Denki & Kayac YUREX") Cc: stable # 2.6.37 Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/misc/yurex.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/misc/yurex.c b/drivers/usb/misc/yurex.c index 73ebfa6e9715..c640f98d20c5 100644 --- a/drivers/usb/misc/yurex.c +++ b/drivers/usb/misc/yurex.c @@ -496,6 +496,9 @@ static ssize_t yurex_write(struct file *file, const char __user *user_buffer, timeout = schedule_timeout(YUREX_WRITE_TIMEOUT); finish_wait(&dev->waitq, &wait); + /* make sure URB is idle after timeout or (spurious) CMD_ACK */ + usb_kill_urb(dev->cntl_urb); + mutex_unlock(&dev->io_mutex); if (retval < 0) { From ce722da66d3e9384aa2de9d33d584ee154e5e157 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Thu, 10 Dec 2020 11:50:06 +0300 Subject: [PATCH 190/547] usb: dwc3: ulpi: Use VStsDone to detect PHY regs access completion In accordance with [1] the DWC_usb3 core sets the GUSB2PHYACCn.VStsDone bit when the PHY vendor control access is done and clears it when the application initiates a new transaction. The doc doesn't say anything about the GUSB2PHYACCn.VStsBsy flag serving for the same purpose. Moreover we've discovered that the VStsBsy flag can be cleared before the VStsDone bit. So using the former as a signal of the PHY control registers completion might be dangerous. Let's have the VStsDone flag utilized instead then. [1] Synopsys DesignWare Cores SuperSpeed USB 3.0 xHCI Host Controller Databook, 2.70a, December 2013, p.388 Fixes: 88bc9d194ff6 ("usb: dwc3: add ULPI interface support") Acked-by: Heikki Krogerus Signed-off-by: Serge Semin Link: https://lore.kernel.org/r/20201210085008.13264-2-Sergey.Semin@baikalelectronics.ru Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/core.h | 1 + drivers/usb/dwc3/ulpi.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/dwc3/core.h b/drivers/usb/dwc3/core.h index 2f95f08ca511..1b241f937d8f 100644 --- a/drivers/usb/dwc3/core.h +++ b/drivers/usb/dwc3/core.h @@ -285,6 +285,7 @@ /* Global USB2 PHY Vendor Control Register */ #define DWC3_GUSB2PHYACC_NEWREGREQ BIT(25) +#define DWC3_GUSB2PHYACC_DONE BIT(24) #define DWC3_GUSB2PHYACC_BUSY BIT(23) #define DWC3_GUSB2PHYACC_WRITE BIT(22) #define DWC3_GUSB2PHYACC_ADDR(n) (n << 16) diff --git a/drivers/usb/dwc3/ulpi.c b/drivers/usb/dwc3/ulpi.c index aa213c9815f6..3cc4f4970c05 100644 --- a/drivers/usb/dwc3/ulpi.c +++ b/drivers/usb/dwc3/ulpi.c @@ -24,7 +24,7 @@ static int dwc3_ulpi_busyloop(struct dwc3 *dwc) while (count--) { reg = dwc3_readl(dwc->regs, DWC3_GUSB2PHYACC(0)); - if (!(reg & DWC3_GUSB2PHYACC_BUSY)) + if (reg & DWC3_GUSB2PHYACC_DONE) return 0; cpu_relax(); } From fca3f138105727c3a22edda32d02f91ce1bf11c9 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Thu, 10 Dec 2020 11:50:07 +0300 Subject: [PATCH 191/547] usb: dwc3: ulpi: Replace CPU-based busyloop with Protocol-based one Originally the procedure of the ULPI transaction finish detection has been developed as a simple busy-loop with just decrementing counter and no delays. It's wrong since on different systems the loop will take a different time to complete. So if the system bus and CPU are fast enough to overtake the ULPI bus and the companion PHY reaction, then we'll get to take a false timeout error. Fix this by converting the busy-loop procedure to take the standard bus speed, address value and the registers access mode into account for the busy-loop delay calculation. Here is the way the fix works. It's known that the ULPI bus is clocked with 60MHz signal. In accordance with [1] the ULPI bus protocol is created so to spend 5 and 6 clock periods for immediate register write and read operations respectively, and 6 and 7 clock periods - for the extended register writes and reads. Based on that we can easily pre-calculate the time which will be needed for the controller to perform a requested IO operation. Note we'll still preserve the attempts counter in case if the DWC USB3 controller has got some internals delays. [1] UTMI+ Low Pin Interface (ULPI) Specification, Revision 1.1, October 20, 2004, pp. 30 - 36. Fixes: 88bc9d194ff6 ("usb: dwc3: add ULPI interface support") Acked-by: Heikki Krogerus Signed-off-by: Serge Semin Link: https://lore.kernel.org/r/20201210085008.13264-3-Sergey.Semin@baikalelectronics.ru Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/ulpi.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/usb/dwc3/ulpi.c b/drivers/usb/dwc3/ulpi.c index 3cc4f4970c05..54c877f7b51d 100644 --- a/drivers/usb/dwc3/ulpi.c +++ b/drivers/usb/dwc3/ulpi.c @@ -7,6 +7,8 @@ * Author: Heikki Krogerus */ +#include +#include #include #include "core.h" @@ -17,12 +19,22 @@ DWC3_GUSB2PHYACC_ADDR(ULPI_ACCESS_EXTENDED) | \ DWC3_GUSB2PHYACC_EXTEND_ADDR(a) : DWC3_GUSB2PHYACC_ADDR(a)) -static int dwc3_ulpi_busyloop(struct dwc3 *dwc) +#define DWC3_ULPI_BASE_DELAY DIV_ROUND_UP(NSEC_PER_SEC, 60000000L) + +static int dwc3_ulpi_busyloop(struct dwc3 *dwc, u8 addr, bool read) { + unsigned long ns = 5L * DWC3_ULPI_BASE_DELAY; unsigned int count = 1000; u32 reg; + if (addr >= ULPI_EXT_VENDOR_SPECIFIC) + ns += DWC3_ULPI_BASE_DELAY; + + if (read) + ns += DWC3_ULPI_BASE_DELAY; + while (count--) { + ndelay(ns); reg = dwc3_readl(dwc->regs, DWC3_GUSB2PHYACC(0)); if (reg & DWC3_GUSB2PHYACC_DONE) return 0; @@ -47,7 +59,7 @@ static int dwc3_ulpi_read(struct device *dev, u8 addr) reg = DWC3_GUSB2PHYACC_NEWREGREQ | DWC3_ULPI_ADDR(addr); dwc3_writel(dwc->regs, DWC3_GUSB2PHYACC(0), reg); - ret = dwc3_ulpi_busyloop(dwc); + ret = dwc3_ulpi_busyloop(dwc, addr, true); if (ret) return ret; @@ -71,7 +83,7 @@ static int dwc3_ulpi_write(struct device *dev, u8 addr, u8 val) reg |= DWC3_GUSB2PHYACC_WRITE | val; dwc3_writel(dwc->regs, DWC3_GUSB2PHYACC(0), reg); - return dwc3_ulpi_busyloop(dwc); + return dwc3_ulpi_busyloop(dwc, addr, false); } static const struct ulpi_ops dwc3_ulpi_ops = { From e5f4ca3fce90a37b23a77bfcc86800d484a80514 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Thu, 10 Dec 2020 11:50:08 +0300 Subject: [PATCH 192/547] usb: dwc3: ulpi: Fix USB2.0 HS/FS/LS PHY suspend regression First of all the commit e0082698b689 ("usb: dwc3: ulpi: conditionally resume ULPI PHY") introduced the Suspend USB2.0 HS/FS/LS PHY regression, as by design of the fix any attempt to read/write from/to the PHY control registers will completely disable the PHY suspension, which consequently will increase the USB bus power consumption. Secondly the fix won't work well for the very first attempt of the ULPI PHY control registers IO, because after disabling the USB2.0 PHY suspension functionality it will still take some time for the bus to resume from the sleep state if one has been reached before it. So the very first PHY register read/write operation will take more time than the busy-loop provides and the IO timeout error might be returned anyway. Here we suggest to fix the denoted problems in the following way. First of all let's not disable the Suspend USB2.0 HS/FS/LS PHY functionality so to make the controller and the USB2.0 bus more power efficient. Secondly instead of that we'll extend the PHY IO op wait procedure with 1 - 1.2 ms sleep if the PHY suspension is enabled (1ms should be enough as by LPM specification it is at most how long it takes for the USB2.0 bus to resume from L1 (Sleep) state). Finally in case if the USB2.0 PHY suspension functionality has been disabled on the DWC USB3 controller setup procedure we'll compensate the USB bus resume process latency by extending the busy-loop attempts counter. Fixes: e0082698b689 ("usb: dwc3: ulpi: conditionally resume ULPI PHY") Acked-by: Heikki Krogerus Signed-off-by: Serge Semin Link: https://lore.kernel.org/r/20201210085008.13264-4-Sergey.Semin@baikalelectronics.ru Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/ulpi.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/drivers/usb/dwc3/ulpi.c b/drivers/usb/dwc3/ulpi.c index 54c877f7b51d..f23f4c9a557e 100644 --- a/drivers/usb/dwc3/ulpi.c +++ b/drivers/usb/dwc3/ulpi.c @@ -24,7 +24,7 @@ static int dwc3_ulpi_busyloop(struct dwc3 *dwc, u8 addr, bool read) { unsigned long ns = 5L * DWC3_ULPI_BASE_DELAY; - unsigned int count = 1000; + unsigned int count = 10000; u32 reg; if (addr >= ULPI_EXT_VENDOR_SPECIFIC) @@ -33,6 +33,10 @@ static int dwc3_ulpi_busyloop(struct dwc3 *dwc, u8 addr, bool read) if (read) ns += DWC3_ULPI_BASE_DELAY; + reg = dwc3_readl(dwc->regs, DWC3_GUSB2PHYCFG(0)); + if (reg & DWC3_GUSB2PHYCFG_SUSPHY) + usleep_range(1000, 1200); + while (count--) { ndelay(ns); reg = dwc3_readl(dwc->regs, DWC3_GUSB2PHYACC(0)); @@ -50,12 +54,6 @@ static int dwc3_ulpi_read(struct device *dev, u8 addr) u32 reg; int ret; - reg = dwc3_readl(dwc->regs, DWC3_GUSB2PHYCFG(0)); - if (reg & DWC3_GUSB2PHYCFG_SUSPHY) { - reg &= ~DWC3_GUSB2PHYCFG_SUSPHY; - dwc3_writel(dwc->regs, DWC3_GUSB2PHYCFG(0), reg); - } - reg = DWC3_GUSB2PHYACC_NEWREGREQ | DWC3_ULPI_ADDR(addr); dwc3_writel(dwc->regs, DWC3_GUSB2PHYACC(0), reg); @@ -73,12 +71,6 @@ static int dwc3_ulpi_write(struct device *dev, u8 addr, u8 val) struct dwc3 *dwc = dev_get_drvdata(dev); u32 reg; - reg = dwc3_readl(dwc->regs, DWC3_GUSB2PHYCFG(0)); - if (reg & DWC3_GUSB2PHYCFG_SUSPHY) { - reg &= ~DWC3_GUSB2PHYCFG_SUSPHY; - dwc3_writel(dwc->regs, DWC3_GUSB2PHYCFG(0), reg); - } - reg = DWC3_GUSB2PHYACC_NEWREGREQ | DWC3_ULPI_ADDR(addr); reg |= DWC3_GUSB2PHYACC_WRITE | val; dwc3_writel(dwc->regs, DWC3_GUSB2PHYACC(0), reg); From 9389044f27081d6ec77730c36d5bf9a1288bcda2 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Mon, 21 Dec 2020 18:35:28 +0100 Subject: [PATCH 193/547] usb: gadget: f_uac2: reset wMaxPacketSize With commit 913e4a90b6f9 ("usb: gadget: f_uac2: finalize wMaxPacketSize according to bandwidth") wMaxPacketSize is computed dynamically but the value is never reset. Because of this, the actual maximum packet size can only decrease each time the audio gadget is instantiated. Reset the endpoint maximum packet size and mark wMaxPacketSize as dynamic to solve the problem. Fixes: 913e4a90b6f9 ("usb: gadget: f_uac2: finalize wMaxPacketSize according to bandwidth") Signed-off-by: Jerome Brunet Cc: stable Link: https://lore.kernel.org/r/20201221173531.215169-2-jbrunet@baylibre.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_uac2.c | 69 ++++++++++++++++++++++------ 1 file changed, 55 insertions(+), 14 deletions(-) diff --git a/drivers/usb/gadget/function/f_uac2.c b/drivers/usb/gadget/function/f_uac2.c index 3633df6d7610..5d960b6603b6 100644 --- a/drivers/usb/gadget/function/f_uac2.c +++ b/drivers/usb/gadget/function/f_uac2.c @@ -271,7 +271,7 @@ static struct usb_endpoint_descriptor fs_epout_desc = { .bEndpointAddress = USB_DIR_OUT, .bmAttributes = USB_ENDPOINT_XFER_ISOC | USB_ENDPOINT_SYNC_ASYNC, - .wMaxPacketSize = cpu_to_le16(1023), + /* .wMaxPacketSize = DYNAMIC */ .bInterval = 1, }; @@ -280,7 +280,7 @@ static struct usb_endpoint_descriptor hs_epout_desc = { .bDescriptorType = USB_DT_ENDPOINT, .bmAttributes = USB_ENDPOINT_XFER_ISOC | USB_ENDPOINT_SYNC_ASYNC, - .wMaxPacketSize = cpu_to_le16(1024), + /* .wMaxPacketSize = DYNAMIC */ .bInterval = 4, }; @@ -348,7 +348,7 @@ static struct usb_endpoint_descriptor fs_epin_desc = { .bEndpointAddress = USB_DIR_IN, .bmAttributes = USB_ENDPOINT_XFER_ISOC | USB_ENDPOINT_SYNC_ASYNC, - .wMaxPacketSize = cpu_to_le16(1023), + /* .wMaxPacketSize = DYNAMIC */ .bInterval = 1, }; @@ -357,7 +357,7 @@ static struct usb_endpoint_descriptor hs_epin_desc = { .bDescriptorType = USB_DT_ENDPOINT, .bmAttributes = USB_ENDPOINT_XFER_ISOC | USB_ENDPOINT_SYNC_ASYNC, - .wMaxPacketSize = cpu_to_le16(1024), + /* .wMaxPacketSize = DYNAMIC */ .bInterval = 4, }; @@ -444,12 +444,28 @@ struct cntrl_range_lay3 { __le32 dRES; } __packed; -static void set_ep_max_packet_size(const struct f_uac2_opts *uac2_opts, +static int set_ep_max_packet_size(const struct f_uac2_opts *uac2_opts, struct usb_endpoint_descriptor *ep_desc, - unsigned int factor, bool is_playback) + enum usb_device_speed speed, bool is_playback) { int chmask, srate, ssize; - u16 max_packet_size; + u16 max_size_bw, max_size_ep; + unsigned int factor; + + switch (speed) { + case USB_SPEED_FULL: + max_size_ep = 1023; + factor = 1000; + break; + + case USB_SPEED_HIGH: + max_size_ep = 1024; + factor = 8000; + break; + + default: + return -EINVAL; + } if (is_playback) { chmask = uac2_opts->p_chmask; @@ -461,10 +477,12 @@ static void set_ep_max_packet_size(const struct f_uac2_opts *uac2_opts, ssize = uac2_opts->c_ssize; } - max_packet_size = num_channels(chmask) * ssize * + max_size_bw = num_channels(chmask) * ssize * DIV_ROUND_UP(srate, factor / (1 << (ep_desc->bInterval - 1))); - ep_desc->wMaxPacketSize = cpu_to_le16(min_t(u16, max_packet_size, - le16_to_cpu(ep_desc->wMaxPacketSize))); + ep_desc->wMaxPacketSize = cpu_to_le16(min_t(u16, max_size_bw, + max_size_ep)); + + return 0; } /* Use macro to overcome line length limitation */ @@ -670,10 +688,33 @@ afunc_bind(struct usb_configuration *cfg, struct usb_function *fn) } /* Calculate wMaxPacketSize according to audio bandwidth */ - set_ep_max_packet_size(uac2_opts, &fs_epin_desc, 1000, true); - set_ep_max_packet_size(uac2_opts, &fs_epout_desc, 1000, false); - set_ep_max_packet_size(uac2_opts, &hs_epin_desc, 8000, true); - set_ep_max_packet_size(uac2_opts, &hs_epout_desc, 8000, false); + ret = set_ep_max_packet_size(uac2_opts, &fs_epin_desc, USB_SPEED_FULL, + true); + if (ret < 0) { + dev_err(dev, "%s:%d Error!\n", __func__, __LINE__); + return ret; + } + + ret = set_ep_max_packet_size(uac2_opts, &fs_epout_desc, USB_SPEED_FULL, + false); + if (ret < 0) { + dev_err(dev, "%s:%d Error!\n", __func__, __LINE__); + return ret; + } + + ret = set_ep_max_packet_size(uac2_opts, &hs_epin_desc, USB_SPEED_HIGH, + true); + if (ret < 0) { + dev_err(dev, "%s:%d Error!\n", __func__, __LINE__); + return ret; + } + + ret = set_ep_max_packet_size(uac2_opts, &hs_epout_desc, USB_SPEED_HIGH, + false); + if (ret < 0) { + dev_err(dev, "%s:%d Error!\n", __func__, __LINE__); + return ret; + } if (EPOUT_EN(uac2_opts)) { agdev->out_ep = usb_ep_autoconfig(gadget, &fs_epout_desc); From 60267ba35c744d851dcd2d22ebaa240ca6aaa15f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 16 Dec 2020 17:19:58 +0100 Subject: [PATCH 194/547] ceph: reencode gid_list when reconnecting On reconnect, cap and dentry releases are dropped and the fields that follow must be reencoded into the freed space. Currently these are timestamp and gid_list, but gid_list isn't reencoded. This results in failed to decode message of type 24 v4: End of buffer errors on the MDS. While at it, make a change to encode gid_list unconditionally, without regard to what head/which version was used as a result of checking whether CEPH_FEATURE_FS_BTIME is supported or not. URL: https://tracker.ceph.com/issues/48618 Fixes: 4f1ddb1ea874 ("ceph: implement updated ceph_mds_request_head structure") Signed-off-by: Ilya Dryomov Reviewed-by: Jeff Layton --- fs/ceph/mds_client.c | 53 ++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 31 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 98c15ff2e599..840587037b59 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2475,6 +2475,22 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, return r; } +static void encode_timestamp_and_gids(void **p, + const struct ceph_mds_request *req) +{ + struct ceph_timespec ts; + int i; + + ceph_encode_timespec64(&ts, &req->r_stamp); + ceph_encode_copy(p, &ts, sizeof(ts)); + + /* gid_list */ + ceph_encode_32(p, req->r_cred->group_info->ngroups); + for (i = 0; i < req->r_cred->group_info->ngroups; i++) + ceph_encode_64(p, from_kgid(&init_user_ns, + req->r_cred->group_info->gid[i])); +} + /* * called under mdsc->mutex */ @@ -2491,7 +2507,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, u64 ino1 = 0, ino2 = 0; int pathlen1 = 0, pathlen2 = 0; bool freepath1 = false, freepath2 = false; - int len, i; + int len; u16 releases; void *p, *end; int ret; @@ -2517,17 +2533,10 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, goto out_free1; } - if (legacy) { - /* Old style */ - len = sizeof(*head); - } else { - /* New style: add gid_list and any later fields */ - len = sizeof(struct ceph_mds_request_head) + sizeof(u32) + - (sizeof(u64) * req->r_cred->group_info->ngroups); - } - + len = legacy ? sizeof(*head) : sizeof(struct ceph_mds_request_head); len += pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) + sizeof(struct ceph_timespec); + len += sizeof(u32) + (sizeof(u64) * req->r_cred->group_info->ngroups); /* calculate (max) length for cap releases */ len += sizeof(struct ceph_mds_request_release) * @@ -2548,7 +2557,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, msg->hdr.tid = cpu_to_le64(req->r_tid); /* - * The old ceph_mds_request_header didn't contain a version field, and + * The old ceph_mds_request_head didn't contain a version field, and * one was added when we moved the message version from 3->4. */ if (legacy) { @@ -2609,20 +2618,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, head->num_releases = cpu_to_le16(releases); - /* time stamp */ - { - struct ceph_timespec ts; - ceph_encode_timespec64(&ts, &req->r_stamp); - ceph_encode_copy(&p, &ts, sizeof(ts)); - } - - /* gid list */ - if (!legacy) { - ceph_encode_32(&p, req->r_cred->group_info->ngroups); - for (i = 0; i < req->r_cred->group_info->ngroups; i++) - ceph_encode_64(&p, from_kgid(&init_user_ns, - req->r_cred->group_info->gid[i])); - } + encode_timestamp_and_gids(&p, req); if (WARN_ON_ONCE(p > end)) { ceph_msg_put(msg); @@ -2730,13 +2726,8 @@ static int __prepare_send_request(struct ceph_mds_session *session, /* remove cap/dentry releases from message */ rhead->num_releases = 0; - /* time stamp */ p = msg->front.iov_base + req->r_request_release_offset; - { - struct ceph_timespec ts; - ceph_encode_timespec64(&ts, &req->r_stamp); - ceph_encode_copy(&p, &ts, sizeof(ts)); - } + encode_timestamp_and_gids(&p, req); msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); From ad32fe8801c38f7b1a8b3814bd1f006cb2b5e781 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 15 Dec 2020 16:40:59 +0100 Subject: [PATCH 195/547] libceph: fix auth_signature buffer allocation in secure mode auth_signature frame is 68 bytes in plain mode and 96 bytes in secure mode but we are requesting 68 bytes in both modes. By luck, this doesn't actually result in any invalid memory accesses because the allocation is satisfied out of kmalloc-96 slab and so exactly 96 bytes are allocated, but KASAN rightfully complains. Fixes: cd1a677cad99 ("libceph, ceph: implement msgr2.1 protocol (crc and secure modes)") Reported-by: Luis Henriques Signed-off-by: Ilya Dryomov --- net/ceph/messenger_v2.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index c1ebb2aa08b5..4f938fc8deaf 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -1333,7 +1333,8 @@ static int prepare_auth_signature(struct ceph_connection *con) void *buf; int ret; - buf = alloc_conn_buf(con, head_onwire_len(SHA256_DIGEST_SIZE, false)); + buf = alloc_conn_buf(con, head_onwire_len(SHA256_DIGEST_SIZE, + con_secure(con))); if (!buf) return -ENOMEM; From f5f2c9a0e3073debc6bc0ecc855ced0158526ee8 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 15 Dec 2020 16:49:07 +0100 Subject: [PATCH 196/547] libceph: align session_key and con_secret to 16 bytes crypto_shash_setkey() and crypto_aead_setkey() will do a (small) GFP_ATOMIC allocation to align the key if it isn't suitably aligned. It's not a big deal, but at the same time easy to avoid. The actual alignment requirement is dynamic, queryable with crypto_shash_alignmask() and crypto_aead_alignmask(), but shouldn't be stricter than 16 bytes for our algorithms. Fixes: cd1a677cad99 ("libceph, ceph: implement msgr2.1 protocol (crc and secure modes)") Signed-off-by: Ilya Dryomov --- net/ceph/messenger_v2.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index 4f938fc8deaf..c38d8de93836 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -2033,10 +2033,18 @@ bad: return -EINVAL; } +/* + * Align session_key and con_secret to avoid GFP_ATOMIC allocation + * inside crypto_shash_setkey() and crypto_aead_setkey() called from + * setup_crypto(). __aligned(16) isn't guaranteed to work for stack + * objects, so do it by hand. + */ static int process_auth_done(struct ceph_connection *con, void *p, void *end) { - u8 session_key[CEPH_KEY_LEN]; - u8 con_secret[CEPH_MAX_CON_SECRET_LEN]; + u8 session_key_buf[CEPH_KEY_LEN + 16]; + u8 con_secret_buf[CEPH_MAX_CON_SECRET_LEN + 16]; + u8 *session_key = PTR_ALIGN(&session_key_buf[0], 16); + u8 *con_secret = PTR_ALIGN(&con_secret_buf[0], 16); int session_key_len, con_secret_len; int payload_len; u64 global_id; From 664f1e259a982bf213f0cd8eea7616c89546585c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 16 Dec 2020 18:40:05 +0100 Subject: [PATCH 197/547] libceph: add __maybe_unused to DEFINE_MSGR2_FEATURE Avoid -Wunused-const-variable warnings for "make W=1". Reported-by: Jeff Layton Signed-off-by: Ilya Dryomov --- include/linux/ceph/msgr.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/ceph/msgr.h b/include/linux/ceph/msgr.h index f5e02f6c0655..3989dcb94d3d 100644 --- a/include/linux/ceph/msgr.h +++ b/include/linux/ceph/msgr.h @@ -33,8 +33,8 @@ #define CEPH_MSGR2_INCARNATION_1 (0ull) #define DEFINE_MSGR2_FEATURE(bit, incarnation, name) \ - static const uint64_t CEPH_MSGR2_FEATURE_##name = (1ULL << bit); \ - static const uint64_t CEPH_MSGR2_FEATUREMASK_##name = \ + static const uint64_t __maybe_unused CEPH_MSGR2_FEATURE_##name = (1ULL << bit); \ + static const uint64_t __maybe_unused CEPH_MSGR2_FEATUREMASK_##name = \ (1ULL << bit | CEPH_MSGR2_INCARNATION_##incarnation); #define HAVE_MSGR2_FEATURE(x, name) \ From 48b0777cd93dbd800d3966b6f5c34714aad5c203 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 28 Dec 2020 16:13:52 -0500 Subject: [PATCH 198/547] Revert "dm crypt: export sysfs of kcryptd workqueue" This reverts commit a2b8b2d975673b1a50ab0bcce5d146b9335edfad. WQ_SYSFS breaks the ability to reload a DM table due to sysfs kobject collision (due to active and inactive table). Given lack of demonstrated need for exposing this workqueue via sysfs: revert exposing it. Reported-by: Ignat Korchagin Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 5f9f9b3a226d..53791138d78b 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -3166,12 +3166,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags)) - cc->crypt_queue = alloc_workqueue("kcryptd-%s", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, + cc->crypt_queue = alloc_workqueue("kcryptd/%s", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1, devname); else - cc->crypt_queue = alloc_workqueue("kcryptd-%s", - WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | - WQ_UNBOUND | WQ_SYSFS, + cc->crypt_queue = alloc_workqueue("kcryptd/%s", + WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus(), devname); if (!cc->crypt_queue) { ti->error = "Couldn't create kcryptd queue"; From 59b4a8fa27f5a895582ada1ae5034af7c94a57b5 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Wed, 23 Dec 2020 19:21:16 -0800 Subject: [PATCH 199/547] CDC-NCM: remove "connected" log message The cdc_ncm driver passes network connection notifications up to usbnet_link_change(), which is the right place for any logging. Remove the netdev_info() duplicating this from the driver itself. This stops devices such as my "TRENDnet USB 10/100/1G/2.5G LAN" (ID 20f4:e02b) adapter from spamming the kernel log with cdc_ncm 2-2:2.0 enp0s2u2c2: network connection: connected messages every 60 msec or so. Signed-off-by: Roland Dreier Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20201224032116.2453938-1-roland@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/usb/cdc_ncm.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index 2bac57d5e8d5..3b816a4731f2 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -1863,9 +1863,6 @@ static void cdc_ncm_status(struct usbnet *dev, struct urb *urb) * USB_CDC_NOTIFY_NETWORK_CONNECTION notification shall be * sent by device after USB_CDC_NOTIFY_SPEED_CHANGE. */ - netif_info(dev, link, dev->net, - "network connection: %sconnected\n", - !!event->wValue ? "" : "dis"); usbnet_link_change(dev, !!event->wValue, 0); break; From 1ad58225dba3f2f598d2c6daed4323f24547168f Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Wed, 23 Dec 2020 22:23:20 +0100 Subject: [PATCH 200/547] net-sysfs: take the rtnl lock when storing xps_cpus Two race conditions can be triggered when storing xps cpus, resulting in various oops and invalid memory accesses: 1. Calling netdev_set_num_tc while netif_set_xps_queue: - netif_set_xps_queue uses dev->tc_num as one of the parameters to compute the size of new_dev_maps when allocating it. dev->tc_num is also used to access the map, and the compiler may generate code to retrieve this field multiple times in the function. - netdev_set_num_tc sets dev->tc_num. If new_dev_maps is allocated using dev->tc_num and then dev->tc_num is set to a higher value through netdev_set_num_tc, later accesses to new_dev_maps in netif_set_xps_queue could lead to accessing memory outside of new_dev_maps; triggering an oops. 2. Calling netif_set_xps_queue while netdev_set_num_tc is running: 2.1. netdev_set_num_tc starts by resetting the xps queues, dev->tc_num isn't updated yet. 2.2. netif_set_xps_queue is called, setting up the map with the *old* dev->num_tc. 2.3. netdev_set_num_tc updates dev->tc_num. 2.4. Later accesses to the map lead to out of bound accesses and oops. A similar issue can be found with netdev_reset_tc. One way of triggering this is to set an iface up (for which the driver uses netdev_set_num_tc in the open path, such as bnx2x) and writing to xps_cpus in a concurrent thread. With the right timing an oops is triggered. Both issues have the same fix: netif_set_xps_queue, netdev_set_num_tc and netdev_reset_tc should be mutually exclusive. We do that by taking the rtnl lock in xps_cpus_store. Fixes: 184c449f91fe ("net: Add support for XPS with QoS via traffic classes") Signed-off-by: Antoine Tenart Reviewed-by: Alexander Duyck Signed-off-by: Jakub Kicinski --- net/core/net-sysfs.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 999b70c59761..7cc15dec1717 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1396,7 +1396,13 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue, return err; } + if (!rtnl_trylock()) { + free_cpumask_var(mask); + return restart_syscall(); + } + err = netif_set_xps_queue(dev, mask, index); + rtnl_unlock(); free_cpumask_var(mask); From fb25038586d0064123e393cadf1fadd70a9df97a Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Wed, 23 Dec 2020 22:23:21 +0100 Subject: [PATCH 201/547] net-sysfs: take the rtnl lock when accessing xps_cpus_map and num_tc Accesses to dev->xps_cpus_map (when using dev->num_tc) should be protected by the rtnl lock, like we do for netif_set_xps_queue. I didn't see an actual bug being triggered, but let's be safe here and take the rtnl lock while accessing the map in sysfs. Fixes: 184c449f91fe ("net: Add support for XPS with QoS via traffic classes") Signed-off-by: Antoine Tenart Reviewed-by: Alexander Duyck Signed-off-by: Jakub Kicinski --- net/core/net-sysfs.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 7cc15dec1717..65886bfbf822 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1317,8 +1317,8 @@ static const struct attribute_group dql_group = { static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf) { + int cpu, len, ret, num_tc = 1, tc = 0; struct net_device *dev = queue->dev; - int cpu, len, num_tc = 1, tc = 0; struct xps_dev_maps *dev_maps; cpumask_var_t mask; unsigned long index; @@ -1328,22 +1328,31 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, index = get_netdev_queue_index(queue); + if (!rtnl_trylock()) + return restart_syscall(); + if (dev->num_tc) { /* Do not allow XPS on subordinate device directly */ num_tc = dev->num_tc; - if (num_tc < 0) - return -EINVAL; + if (num_tc < 0) { + ret = -EINVAL; + goto err_rtnl_unlock; + } /* If queue belongs to subordinate dev use its map */ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; tc = netdev_txq_to_tc(dev, index); - if (tc < 0) - return -EINVAL; + if (tc < 0) { + ret = -EINVAL; + goto err_rtnl_unlock; + } } - if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { + ret = -ENOMEM; + goto err_rtnl_unlock; + } rcu_read_lock(); dev_maps = rcu_dereference(dev->xps_cpus_map); @@ -1366,9 +1375,15 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, } rcu_read_unlock(); + rtnl_unlock(); + len = snprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask)); free_cpumask_var(mask); return len < PAGE_SIZE ? len : -EINVAL; + +err_rtnl_unlock: + rtnl_unlock(); + return ret; } static ssize_t xps_cpus_store(struct netdev_queue *queue, From 2d57b4f142e0b03e854612b8e28978935414bced Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Wed, 23 Dec 2020 22:23:22 +0100 Subject: [PATCH 202/547] net-sysfs: take the rtnl lock when storing xps_rxqs Two race conditions can be triggered when storing xps rxqs, resulting in various oops and invalid memory accesses: 1. Calling netdev_set_num_tc while netif_set_xps_queue: - netif_set_xps_queue uses dev->tc_num as one of the parameters to compute the size of new_dev_maps when allocating it. dev->tc_num is also used to access the map, and the compiler may generate code to retrieve this field multiple times in the function. - netdev_set_num_tc sets dev->tc_num. If new_dev_maps is allocated using dev->tc_num and then dev->tc_num is set to a higher value through netdev_set_num_tc, later accesses to new_dev_maps in netif_set_xps_queue could lead to accessing memory outside of new_dev_maps; triggering an oops. 2. Calling netif_set_xps_queue while netdev_set_num_tc is running: 2.1. netdev_set_num_tc starts by resetting the xps queues, dev->tc_num isn't updated yet. 2.2. netif_set_xps_queue is called, setting up the map with the *old* dev->num_tc. 2.3. netdev_set_num_tc updates dev->tc_num. 2.4. Later accesses to the map lead to out of bound accesses and oops. A similar issue can be found with netdev_reset_tc. One way of triggering this is to set an iface up (for which the driver uses netdev_set_num_tc in the open path, such as bnx2x) and writing to xps_rxqs in a concurrent thread. With the right timing an oops is triggered. Both issues have the same fix: netif_set_xps_queue, netdev_set_num_tc and netdev_reset_tc should be mutually exclusive. We do that by taking the rtnl lock in xps_rxqs_store. Fixes: 8af2c06ff4b1 ("net-sysfs: Add interface for Rx queue(s) map per Tx queue") Signed-off-by: Antoine Tenart Reviewed-by: Alexander Duyck Signed-off-by: Jakub Kicinski --- net/core/net-sysfs.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 65886bfbf822..62ca2f2c0ee6 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1499,10 +1499,17 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, return err; } + if (!rtnl_trylock()) { + bitmap_free(mask); + return restart_syscall(); + } + cpus_read_lock(); err = __netif_set_xps_queue(dev, mask, index, true); cpus_read_unlock(); + rtnl_unlock(); + bitmap_free(mask); return err ? : len; } From 4ae2bb81649dc03dfc95875f02126b14b773f7ab Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Wed, 23 Dec 2020 22:23:23 +0100 Subject: [PATCH 203/547] net-sysfs: take the rtnl lock when accessing xps_rxqs_map and num_tc Accesses to dev->xps_rxqs_map (when using dev->num_tc) should be protected by the rtnl lock, like we do for netif_set_xps_queue. I didn't see an actual bug being triggered, but let's be safe here and take the rtnl lock while accessing the map in sysfs. Fixes: 8af2c06ff4b1 ("net-sysfs: Add interface for Rx queue(s) map per Tx queue") Signed-off-by: Antoine Tenart Reviewed-by: Alexander Duyck Signed-off-by: Jakub Kicinski --- net/core/net-sysfs.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 62ca2f2c0ee6..daf502c13d6d 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1429,22 +1429,29 @@ static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) { + int j, len, ret, num_tc = 1, tc = 0; struct net_device *dev = queue->dev; struct xps_dev_maps *dev_maps; unsigned long *mask, index; - int j, len, num_tc = 1, tc = 0; index = get_netdev_queue_index(queue); + if (!rtnl_trylock()) + return restart_syscall(); + if (dev->num_tc) { num_tc = dev->num_tc; tc = netdev_txq_to_tc(dev, index); - if (tc < 0) - return -EINVAL; + if (tc < 0) { + ret = -EINVAL; + goto err_rtnl_unlock; + } } mask = bitmap_zalloc(dev->num_rx_queues, GFP_KERNEL); - if (!mask) - return -ENOMEM; + if (!mask) { + ret = -ENOMEM; + goto err_rtnl_unlock; + } rcu_read_lock(); dev_maps = rcu_dereference(dev->xps_rxqs_map); @@ -1470,10 +1477,16 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) out_no_maps: rcu_read_unlock(); + rtnl_unlock(); + len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues); bitmap_free(mask); return len < PAGE_SIZE ? len : -EINVAL; + +err_rtnl_unlock: + rtnl_unlock(); + return ret; } static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, From 4614792eebcbf81c60ad3604c1aeeb2b0899cea4 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Thu, 24 Dec 2020 18:24:05 +0200 Subject: [PATCH 204/547] net: ethernet: ti: cpts: fix ethtool output when no ptp_clock registered The CPTS driver registers PTP PHC clock when first netif is going up and unregister it when all netif are down. Now ethtool will show: - PTP PHC clock index 0 after boot until first netif is up; - the last assigned PTP PHC clock index even if PTP PHC clock is not registered any more after all netifs are down. This patch ensures that -1 is returned by ethtool when PTP PHC clock is not registered any more. Fixes: 8a2c9a5ab4b9 ("net: ethernet: ti: cpts: rework initialization/deinitialization") Signed-off-by: Grygorii Strashko Acked-by: Richard Cochran Link: https://lore.kernel.org/r/20201224162405.28032-1-grygorii.strashko@ti.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/cpts.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/ti/cpts.c b/drivers/net/ethernet/ti/cpts.c index d1fc7955d422..43222a34cba0 100644 --- a/drivers/net/ethernet/ti/cpts.c +++ b/drivers/net/ethernet/ti/cpts.c @@ -599,6 +599,7 @@ void cpts_unregister(struct cpts *cpts) ptp_clock_unregister(cpts->clock); cpts->clock = NULL; + cpts->phc_index = -1; cpts_write32(cpts, 0, int_enable); cpts_write32(cpts, 0, control); @@ -784,6 +785,7 @@ struct cpts *cpts_create(struct device *dev, void __iomem *regs, cpts->cc.read = cpts_systim_read; cpts->cc.mask = CLOCKSOURCE_MASK(32); cpts->info = cpts_info; + cpts->phc_index = -1; if (n_ext_ts) cpts->info.n_ext_ts = n_ext_ts; From 950271d7cc0b4546af3549d8143c4132d6e1f138 Mon Sep 17 00:00:00 2001 From: Yunjian Wang Date: Fri, 25 Dec 2020 10:52:16 +0800 Subject: [PATCH 205/547] tun: fix return value when the number of iovs exceeds MAX_SKB_FRAGS Currently the tun_napi_alloc_frags() function returns -ENOMEM when the number of iovs exceeds MAX_SKB_FRAGS + 1. However this is inappropriate, we should use -EMSGSIZE instead of -ENOMEM. The following distinctions are matters: 1. the caller need to drop the bad packet when -EMSGSIZE is returned, which means meeting a persistent failure. 2. the caller can try again when -ENOMEM is returned, which means meeting a transient failure. Fixes: 90e33d459407 ("tun: enable napi_gro_frags() for TUN/TAP driver") Signed-off-by: Yunjian Wang Acked-by: Willem de Bruijn Acked-by: Jason Wang Acked-by: Michael S. Tsirkin Link: https://lore.kernel.org/r/1608864736-24332-1-git-send-email-wangyunjian@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/tun.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index fbed05ae7b0f..978ac0981d16 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1365,7 +1365,7 @@ static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile, int i; if (it->nr_segs > MAX_SKB_FRAGS + 1) - return ERR_PTR(-ENOMEM); + return ERR_PTR(-EMSGSIZE); local_bh_disable(); skb = napi_get_frags(&tfile->napi); From e7579d5d5b3298f7e888ed07ac16bfb7174c135a Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Mon, 21 Dec 2020 22:07:25 +0100 Subject: [PATCH 206/547] net: mptcp: cap forward allocation to 1M the following syzkaller reproducer: r0 = socket$inet_mptcp(0x2, 0x1, 0x106) bind$inet(r0, &(0x7f0000000080)={0x2, 0x4e24, @multicast2}, 0x10) connect$inet(r0, &(0x7f0000000480)={0x2, 0x4e24, @local}, 0x10) sendto$inet(r0, &(0x7f0000000100)="f6", 0xffffffe7, 0xc000, 0x0, 0x0) systematically triggers the following warning: WARNING: CPU: 2 PID: 8618 at net/core/stream.c:208 sk_stream_kill_queues+0x3fa/0x580 Modules linked in: CPU: 2 PID: 8618 Comm: syz-executor Not tainted 5.10.0+ #334 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.1-4.module+el8.1.0+4066+0f1aadab 04/04 RIP: 0010:sk_stream_kill_queues+0x3fa/0x580 Code: df 48 c1 ea 03 0f b6 04 02 84 c0 74 04 3c 03 7e 40 8b ab 20 02 00 00 e9 64 ff ff ff e8 df f0 81 2 RSP: 0018:ffffc9000290fcb0 EFLAGS: 00010293 RAX: ffff888011cb8000 RBX: 0000000000000000 RCX: ffffffff86eecf0e RDX: 0000000000000000 RSI: ffffffff86eecf6a RDI: 0000000000000005 RBP: 0000000000000e28 R08: ffff888011cb8000 R09: fffffbfff1f48139 R10: ffffffff8fa409c7 R11: fffffbfff1f48138 R12: ffff8880215e6220 R13: ffffffff8fa409c0 R14: ffffc9000290fd30 R15: 1ffff92000521fa2 FS: 00007f41c78f4800(0000) GS:ffff88802d000000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f95c803d088 CR3: 0000000025ed2000 CR4: 00000000000006f0 Call Trace: __mptcp_destroy_sock+0x4f5/0x8e0 mptcp_close+0x5e2/0x7f0 inet_release+0x12b/0x270 __sock_release+0xc8/0x270 sock_close+0x18/0x20 __fput+0x272/0x8e0 task_work_run+0xe0/0x1a0 exit_to_user_mode_prepare+0x1df/0x200 syscall_exit_to_user_mode+0x19/0x50 entry_SYSCALL_64_after_hwframe+0x44/0xa9 userspace programs provide arbitrarily high values of 'len' in sendmsg(): this is causing integer overflow of 'amount'. Cap forward allocation to 1 megabyte: higher values are not really useful. Suggested-by: Paolo Abeni Fixes: e93da92896bc ("mptcp: implement wmem reservation") Signed-off-by: Davide Caratti Link: https://lore.kernel.org/r/3334d00d8b2faecafdfab9aa593efcbf61442756.1608584474.git.dcaratti@redhat.com Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 09b19aa2f205..6628d8d74203 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -877,6 +877,9 @@ static void __mptcp_wmem_reserve(struct sock *sk, int size) struct mptcp_sock *msk = mptcp_sk(sk); WARN_ON_ONCE(msk->wmem_reserved); + if (WARN_ON_ONCE(amount < 0)) + amount = 0; + if (amount <= sk->sk_forward_alloc) goto reserve; @@ -1587,7 +1590,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) return -EOPNOTSUPP; - mptcp_lock_sock(sk, __mptcp_wmem_reserve(sk, len)); + mptcp_lock_sock(sk, __mptcp_wmem_reserve(sk, min_t(size_t, 1 << 20, len))); timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); From fb1e6e562b37b39adfe251919c9abfdb3e01f921 Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Sun, 27 Dec 2020 14:18:17 -0500 Subject: [PATCH 207/547] bnxt_en: Fix AER recovery. A recent change skips sending firmware messages to the firmware when pci_channel_offline() is true during fatal AER error. To make this complete, we need to move the re-initialization sequence to bnxt_io_resume(), otherwise the firmware messages to re-initialize will all be skipped. In any case, it is more correct to re-initialize in bnxt_io_resume(). Also, fix the reverse x-mas tree format when defining variables in bnxt_io_slot_reset(). Fixes: b340dc680ed4 ("bnxt_en: Avoid sending firmware messages when AER error is detected.") Reviewed-by: Edwin Peer Signed-off-by: Vasundhara Volam Signed-off-by: Michael Chan Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 31 ++++++++++------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 4edd6f8e017e..b8351e24395d 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -12887,10 +12887,10 @@ static pci_ers_result_t bnxt_io_error_detected(struct pci_dev *pdev, */ static pci_ers_result_t bnxt_io_slot_reset(struct pci_dev *pdev) { + pci_ers_result_t result = PCI_ERS_RESULT_DISCONNECT; struct net_device *netdev = pci_get_drvdata(pdev); struct bnxt *bp = netdev_priv(netdev); int err = 0, off; - pci_ers_result_t result = PCI_ERS_RESULT_DISCONNECT; netdev_info(bp->dev, "PCI Slot Reset\n"); @@ -12919,22 +12919,8 @@ static pci_ers_result_t bnxt_io_slot_reset(struct pci_dev *pdev) pci_save_state(pdev); err = bnxt_hwrm_func_reset(bp); - if (!err) { - err = bnxt_hwrm_func_qcaps(bp); - if (!err && netif_running(netdev)) - err = bnxt_open(netdev); - } - bnxt_ulp_start(bp, err); - if (!err) { - bnxt_reenable_sriov(bp); + if (!err) result = PCI_ERS_RESULT_RECOVERED; - } - } - - if (result != PCI_ERS_RESULT_RECOVERED) { - if (netif_running(netdev)) - dev_close(netdev); - pci_disable_device(pdev); } rtnl_unlock(); @@ -12952,10 +12938,21 @@ static pci_ers_result_t bnxt_io_slot_reset(struct pci_dev *pdev) static void bnxt_io_resume(struct pci_dev *pdev) { struct net_device *netdev = pci_get_drvdata(pdev); + struct bnxt *bp = netdev_priv(netdev); + int err; + netdev_info(bp->dev, "PCI Slot Resume\n"); rtnl_lock(); - netif_device_attach(netdev); + err = bnxt_hwrm_func_qcaps(bp); + if (!err && netif_running(netdev)) + err = bnxt_open(netdev); + + bnxt_ulp_start(bp, err); + if (!err) { + bnxt_reenable_sriov(bp); + netif_device_attach(netdev); + } rtnl_unlock(); } From a029a2fef5d11bb85587433c3783615442abac96 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sun, 27 Dec 2020 14:18:18 -0500 Subject: [PATCH 208/547] bnxt_en: Check TQM rings for maximum supported value. TQM rings are hardware resources that require host context memory managed by the driver. The driver supports up to 9 TQM rings and the number of rings to use is requested by firmware during run-time. Cap this number to the maximum supported to prevent accessing beyond the array. Future firmware may request more than 9 TQM rings. Define macros to remove the magic number 9 from the C code. Fixes: ac3158cb0108 ("bnxt_en: Allocate TQM ring context memory according to fw specification.") Reviewed-by: Pavan Chebbi Reviewed-by: Vasundhara Volam Signed-off-by: Michael Chan Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 7 +++++-- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 7 ++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index b8351e24395d..d10e4f85dd11 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -6790,8 +6790,10 @@ static int bnxt_hwrm_func_backing_store_qcaps(struct bnxt *bp) ctx->tqm_fp_rings_count = resp->tqm_fp_rings_count; if (!ctx->tqm_fp_rings_count) ctx->tqm_fp_rings_count = bp->max_q; + else if (ctx->tqm_fp_rings_count > BNXT_MAX_TQM_FP_RINGS) + ctx->tqm_fp_rings_count = BNXT_MAX_TQM_FP_RINGS; - tqm_rings = ctx->tqm_fp_rings_count + 1; + tqm_rings = ctx->tqm_fp_rings_count + BNXT_MAX_TQM_SP_RINGS; ctx_pg = kcalloc(tqm_rings, sizeof(*ctx_pg), GFP_KERNEL); if (!ctx_pg) { kfree(ctx); @@ -6925,7 +6927,8 @@ static int bnxt_hwrm_func_backing_store_cfg(struct bnxt *bp, u32 enables) pg_attr = &req.tqm_sp_pg_size_tqm_sp_lvl, pg_dir = &req.tqm_sp_page_dir, ena = FUNC_BACKING_STORE_CFG_REQ_ENABLES_TQM_SP; - i < 9; i++, num_entries++, pg_attr++, pg_dir++, ena <<= 1) { + i < BNXT_MAX_TQM_RINGS; + i++, num_entries++, pg_attr++, pg_dir++, ena <<= 1) { if (!(enables & ena)) continue; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 950ea26ae0d2..51996c85547e 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -1436,6 +1436,11 @@ struct bnxt_ctx_pg_info { struct bnxt_ctx_pg_info **ctx_pg_tbl; }; +#define BNXT_MAX_TQM_SP_RINGS 1 +#define BNXT_MAX_TQM_FP_RINGS 8 +#define BNXT_MAX_TQM_RINGS \ + (BNXT_MAX_TQM_SP_RINGS + BNXT_MAX_TQM_FP_RINGS) + struct bnxt_ctx_mem_info { u32 qp_max_entries; u16 qp_min_qp1_entries; @@ -1474,7 +1479,7 @@ struct bnxt_ctx_mem_info { struct bnxt_ctx_pg_info stat_mem; struct bnxt_ctx_pg_info mrav_mem; struct bnxt_ctx_pg_info tim_mem; - struct bnxt_ctx_pg_info *tqm_mem[9]; + struct bnxt_ctx_pg_info *tqm_mem[BNXT_MAX_TQM_RINGS]; }; struct bnxt_fw_health { From 1169318bd565d2911b949f6123e109baa35881b6 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sat, 26 Dec 2020 15:37:36 -0600 Subject: [PATCH 209/547] net: ipa: don't return a value from gsi_channel_command() Callers of gsi_channel_command() no longer care whether the command times out, and don't use what gsi_channel_command() returns. Redefine that function to have void return type. Reported-by: kernel test robot Fixes: 6ffddf3b3d182 ("net: ipa: use state to determine channel command success") Signed-off-by: Alex Elder Signed-off-by: Jakub Kicinski --- drivers/net/ipa/gsi.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/drivers/net/ipa/gsi.c b/drivers/net/ipa/gsi.c index 579cc3e51677..e51a77057899 100644 --- a/drivers/net/ipa/gsi.c +++ b/drivers/net/ipa/gsi.c @@ -454,7 +454,7 @@ static enum gsi_channel_state gsi_channel_state(struct gsi_channel *channel) } /* Issue a channel command and wait for it to complete */ -static int +static void gsi_channel_command(struct gsi_channel *channel, enum gsi_ch_cmd_opcode opcode) { struct completion *completion = &channel->completion; @@ -489,12 +489,10 @@ gsi_channel_command(struct gsi_channel *channel, enum gsi_ch_cmd_opcode opcode) iowrite32(0, gsi->virt + GSI_CNTXT_SRC_CH_IRQ_MSK_OFFSET); if (success) - return 0; + return; dev_err(dev, "GSI command %u for channel %u timed out, state %u\n", opcode, channel_id, gsi_channel_state(channel)); - - return -ETIMEDOUT; } /* Allocate GSI channel in NOT_ALLOCATED state */ @@ -503,7 +501,6 @@ static int gsi_channel_alloc_command(struct gsi *gsi, u32 channel_id) struct gsi_channel *channel = &gsi->channel[channel_id]; struct device *dev = gsi->dev; enum gsi_channel_state state; - int ret; /* Get initial channel state */ state = gsi_channel_state(channel); @@ -513,7 +510,7 @@ static int gsi_channel_alloc_command(struct gsi *gsi, u32 channel_id) return -EINVAL; } - ret = gsi_channel_command(channel, GSI_CH_ALLOCATE); + gsi_channel_command(channel, GSI_CH_ALLOCATE); /* If successful the channel state will have changed */ state = gsi_channel_state(channel); @@ -531,7 +528,6 @@ static int gsi_channel_start_command(struct gsi_channel *channel) { struct device *dev = channel->gsi->dev; enum gsi_channel_state state; - int ret; state = gsi_channel_state(channel); if (state != GSI_CHANNEL_STATE_ALLOCATED && @@ -541,7 +537,7 @@ static int gsi_channel_start_command(struct gsi_channel *channel) return -EINVAL; } - ret = gsi_channel_command(channel, GSI_CH_START); + gsi_channel_command(channel, GSI_CH_START); /* If successful the channel state will have changed */ state = gsi_channel_state(channel); @@ -559,7 +555,6 @@ static int gsi_channel_stop_command(struct gsi_channel *channel) { struct device *dev = channel->gsi->dev; enum gsi_channel_state state; - int ret; state = gsi_channel_state(channel); @@ -576,7 +571,7 @@ static int gsi_channel_stop_command(struct gsi_channel *channel) return -EINVAL; } - ret = gsi_channel_command(channel, GSI_CH_STOP); + gsi_channel_command(channel, GSI_CH_STOP); /* If successful the channel state will have changed */ state = gsi_channel_state(channel); @@ -598,7 +593,6 @@ static void gsi_channel_reset_command(struct gsi_channel *channel) { struct device *dev = channel->gsi->dev; enum gsi_channel_state state; - int ret; msleep(1); /* A short delay is required before a RESET command */ @@ -612,7 +606,7 @@ static void gsi_channel_reset_command(struct gsi_channel *channel) return; } - ret = gsi_channel_command(channel, GSI_CH_RESET); + gsi_channel_command(channel, GSI_CH_RESET); /* If successful the channel state will have changed */ state = gsi_channel_state(channel); @@ -627,7 +621,6 @@ static void gsi_channel_de_alloc_command(struct gsi *gsi, u32 channel_id) struct gsi_channel *channel = &gsi->channel[channel_id]; struct device *dev = gsi->dev; enum gsi_channel_state state; - int ret; state = gsi_channel_state(channel); if (state != GSI_CHANNEL_STATE_ALLOCATED) { @@ -636,7 +629,7 @@ static void gsi_channel_de_alloc_command(struct gsi *gsi, u32 channel_id) return; } - ret = gsi_channel_command(channel, GSI_CH_DE_ALLOC); + gsi_channel_command(channel, GSI_CH_DE_ALLOC); /* If successful the channel state will have changed */ state = gsi_channel_state(channel); From 1ddf776b498c922935d0ec3283b9817dd33aedf7 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sat, 26 Dec 2020 15:37:37 -0600 Subject: [PATCH 210/547] net: ipa: don't return a value from evt_ring_command() Callers of evt_ring_command() no longer care whether the command times out, and don't use what evt_ring_command() returns. Redefine that function to have void return type. Reported-by: kernel test robot Fixes: 428b448ee764a ("net: ipa: use state to determine event ring command success") Signed-off-by: Alex Elder Signed-off-by: Jakub Kicinski --- drivers/net/ipa/gsi.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/drivers/net/ipa/gsi.c b/drivers/net/ipa/gsi.c index e51a77057899..14d9a791924b 100644 --- a/drivers/net/ipa/gsi.c +++ b/drivers/net/ipa/gsi.c @@ -326,8 +326,8 @@ gsi_evt_ring_state(struct gsi *gsi, u32 evt_ring_id) } /* Issue an event ring command and wait for it to complete */ -static int evt_ring_command(struct gsi *gsi, u32 evt_ring_id, - enum gsi_evt_cmd_opcode opcode) +static void evt_ring_command(struct gsi *gsi, u32 evt_ring_id, + enum gsi_evt_cmd_opcode opcode) { struct gsi_evt_ring *evt_ring = &gsi->evt_ring[evt_ring_id]; struct completion *completion = &evt_ring->completion; @@ -361,19 +361,16 @@ static int evt_ring_command(struct gsi *gsi, u32 evt_ring_id, iowrite32(0, gsi->virt + GSI_CNTXT_SRC_EV_CH_IRQ_MSK_OFFSET); if (success) - return 0; + return; dev_err(dev, "GSI command %u for event ring %u timed out, state %u\n", opcode, evt_ring_id, evt_ring->state); - - return -ETIMEDOUT; } /* Allocate an event ring in NOT_ALLOCATED state */ static int gsi_evt_ring_alloc_command(struct gsi *gsi, u32 evt_ring_id) { struct gsi_evt_ring *evt_ring = &gsi->evt_ring[evt_ring_id]; - int ret; /* Get initial event ring state */ evt_ring->state = gsi_evt_ring_state(gsi, evt_ring_id); @@ -383,7 +380,7 @@ static int gsi_evt_ring_alloc_command(struct gsi *gsi, u32 evt_ring_id) return -EINVAL; } - ret = evt_ring_command(gsi, evt_ring_id, GSI_EVT_ALLOCATE); + evt_ring_command(gsi, evt_ring_id, GSI_EVT_ALLOCATE); /* If successful the event ring state will have changed */ if (evt_ring->state == GSI_EVT_RING_STATE_ALLOCATED) @@ -400,7 +397,6 @@ static void gsi_evt_ring_reset_command(struct gsi *gsi, u32 evt_ring_id) { struct gsi_evt_ring *evt_ring = &gsi->evt_ring[evt_ring_id]; enum gsi_evt_ring_state state = evt_ring->state; - int ret; if (state != GSI_EVT_RING_STATE_ALLOCATED && state != GSI_EVT_RING_STATE_ERROR) { @@ -409,7 +405,7 @@ static void gsi_evt_ring_reset_command(struct gsi *gsi, u32 evt_ring_id) return; } - ret = evt_ring_command(gsi, evt_ring_id, GSI_EVT_RESET); + evt_ring_command(gsi, evt_ring_id, GSI_EVT_RESET); /* If successful the event ring state will have changed */ if (evt_ring->state == GSI_EVT_RING_STATE_ALLOCATED) @@ -423,7 +419,6 @@ static void gsi_evt_ring_reset_command(struct gsi *gsi, u32 evt_ring_id) static void gsi_evt_ring_de_alloc_command(struct gsi *gsi, u32 evt_ring_id) { struct gsi_evt_ring *evt_ring = &gsi->evt_ring[evt_ring_id]; - int ret; if (evt_ring->state != GSI_EVT_RING_STATE_ALLOCATED) { dev_err(gsi->dev, "event ring %u state %u before dealloc\n", @@ -431,7 +426,7 @@ static void gsi_evt_ring_de_alloc_command(struct gsi *gsi, u32 evt_ring_id) return; } - ret = evt_ring_command(gsi, evt_ring_id, GSI_EVT_DE_ALLOC); + evt_ring_command(gsi, evt_ring_id, GSI_EVT_DE_ALLOC); /* If successful the event ring state will have changed */ if (evt_ring->state == GSI_EVT_RING_STATE_NOT_ALLOCATED) From 4f374d2c43a9e5e773f1dee56db63bd6b8a36276 Mon Sep 17 00:00:00 2001 From: Stefan Chulski Date: Wed, 23 Dec 2020 20:35:21 +0200 Subject: [PATCH 211/547] net: mvpp2: fix pkt coalescing int-threshold configuration The packet coalescing interrupt threshold has separated registers for different aggregated/cpu (sw-thread). The required value should be loaded for every thread but not only for 1 current cpu. Fixes: 213f428f5056 ("net: mvpp2: add support for TX interrupts and RX queue distribution modes") Signed-off-by: Stefan Chulski Link: https://lore.kernel.org/r/1608748521-11033-1-git-send-email-stefanc@marvell.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index f20b31327027..4b1808acef58 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -2370,17 +2370,18 @@ static void mvpp2_rx_pkts_coal_set(struct mvpp2_port *port, static void mvpp2_tx_pkts_coal_set(struct mvpp2_port *port, struct mvpp2_tx_queue *txq) { - unsigned int thread = mvpp2_cpu_to_thread(port->priv, get_cpu()); + unsigned int thread; u32 val; if (txq->done_pkts_coal > MVPP2_TXQ_THRESH_MASK) txq->done_pkts_coal = MVPP2_TXQ_THRESH_MASK; val = (txq->done_pkts_coal << MVPP2_TXQ_THRESH_OFFSET); - mvpp2_thread_write(port->priv, thread, MVPP2_TXQ_NUM_REG, txq->id); - mvpp2_thread_write(port->priv, thread, MVPP2_TXQ_THRESH_REG, val); - - put_cpu(); + /* PKT-coalescing registers are per-queue + per-thread */ + for (thread = 0; thread < MVPP2_MAX_THREADS; thread++) { + mvpp2_thread_write(port->priv, thread, MVPP2_TXQ_NUM_REG, txq->id); + mvpp2_thread_write(port->priv, thread, MVPP2_TXQ_THRESH_REG, val); + } } static u32 mvpp2_usec_to_cycles(u32 usec, unsigned long clk_hz) From 21fdca22eb7df2a1e194b8adb812ce370748b733 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Thu, 24 Dec 2020 20:01:09 +0100 Subject: [PATCH 212/547] ipv4: Ignore ECN bits for fib lookups in fib_compute_spec_dst() RT_TOS() only clears one of the ECN bits. Therefore, when fib_compute_spec_dst() resorts to a fib lookup, it can return different results depending on the value of the second ECN bit. For example, ECT(0) and ECT(1) packets could be treated differently. $ ip netns add ns0 $ ip netns add ns1 $ ip link add name veth01 netns ns0 type veth peer name veth10 netns ns1 $ ip -netns ns0 link set dev lo up $ ip -netns ns1 link set dev lo up $ ip -netns ns0 link set dev veth01 up $ ip -netns ns1 link set dev veth10 up $ ip -netns ns0 address add 192.0.2.10/24 dev veth01 $ ip -netns ns1 address add 192.0.2.11/24 dev veth10 $ ip -netns ns1 address add 192.0.2.21/32 dev lo $ ip -netns ns1 route add 192.0.2.10/32 tos 4 dev veth10 src 192.0.2.21 $ ip netns exec ns1 sysctl -wq net.ipv4.icmp_echo_ignore_broadcasts=0 With TOS 4 and ECT(1), ns1 replies using source address 192.0.2.21 (ping uses -Q to set all TOS and ECN bits): $ ip netns exec ns0 ping -c 1 -b -Q 5 192.0.2.255 [...] 64 bytes from 192.0.2.21: icmp_seq=1 ttl=64 time=0.544 ms But with TOS 4 and ECT(0), ns1 replies using source address 192.0.2.11 because the "tos 4" route isn't matched: $ ip netns exec ns0 ping -c 1 -b -Q 6 192.0.2.255 [...] 64 bytes from 192.0.2.11: icmp_seq=1 ttl=64 time=0.597 ms After this patch the ECN bits don't affect the result anymore: $ ip netns exec ns0 ping -c 1 -b -Q 6 192.0.2.255 [...] 64 bytes from 192.0.2.21: icmp_seq=1 ttl=64 time=0.591 ms Fixes: 35ebf65e851c ("ipv4: Create and use fib_compute_spec_dst() helper.") Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index cdf6ec5aa45d..84bb707bd88d 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -292,7 +292,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) .flowi4_iif = LOOPBACK_IFINDEX, .flowi4_oif = l3mdev_master_ifindex_rcu(dev), .daddr = ip_hdr(skb)->saddr, - .flowi4_tos = RT_TOS(ip_hdr(skb)->tos), + .flowi4_tos = ip_hdr(skb)->tos & IPTOS_RT_MASK, .flowi4_scope = scope, .flowi4_mark = vmark ? skb->mark : 0, }; From a533b70a657c03137dd49cbcfee70aac086ab2b1 Mon Sep 17 00:00:00 2001 From: weichenchen Date: Fri, 25 Dec 2020 13:44:45 +0800 Subject: [PATCH 213/547] net: neighbor: fix a crash caused by mod zero pneigh_enqueue() tries to obtain a random delay by mod NEIGH_VAR(p, PROXY_DELAY). However, NEIGH_VAR(p, PROXY_DELAY) migth be zero at that point because someone could write zero to /proc/sys/net/ipv4/neigh/[device]/proxy_delay after the callers check it. This patch uses prandom_u32_max() to get a random delay instead which avoids potential division by zero. Signed-off-by: weichenchen Signed-off-by: David S. Miller --- net/core/neighbour.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 9500d28a43b0..277ed854aef1 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1569,10 +1569,8 @@ static void neigh_proxy_process(struct timer_list *t) void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb) { - unsigned long now = jiffies; - - unsigned long sched_next = now + (prandom_u32() % - NEIGH_VAR(p, PROXY_DELAY)); + unsigned long sched_next = jiffies + + prandom_u32_max(NEIGH_VAR(p, PROXY_DELAY)); if (tbl->proxy_queue.qlen > NEIGH_VAR(p, PROXY_QLEN)) { kfree_skb(skb); From bd1248f1ddbc48b0c30565fce897a3b6423313b8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 24 Dec 2020 22:23:44 -0800 Subject: [PATCH 214/547] net: sched: prevent invalid Scell_log shift count Check Scell_log shift size in red_check_params() and modify all callers of red_check_params() to pass Scell_log. This prevents a shift out-of-bounds as detected by UBSAN: UBSAN: shift-out-of-bounds in ./include/net/red.h:252:22 shift exponent 72 is too large for 32-bit type 'int' Fixes: 8afa10cbe281 ("net_sched: red: Avoid illegal values") Signed-off-by: Randy Dunlap Reported-by: syzbot+97c5bd9cc81eca63d36e@syzkaller.appspotmail.com Cc: Nogah Frankel Cc: Jamal Hadi Salim Cc: Cong Wang Cc: Jiri Pirko Cc: netdev@vger.kernel.org Cc: "David S. Miller" Cc: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/red.h | 4 +++- net/sched/sch_choke.c | 2 +- net/sched/sch_gred.c | 2 +- net/sched/sch_red.c | 2 +- net/sched/sch_sfq.c | 2 +- 5 files changed, 7 insertions(+), 5 deletions(-) diff --git a/include/net/red.h b/include/net/red.h index fc455445f4b2..932f0d79d60c 100644 --- a/include/net/red.h +++ b/include/net/red.h @@ -168,12 +168,14 @@ static inline void red_set_vars(struct red_vars *v) v->qcount = -1; } -static inline bool red_check_params(u32 qth_min, u32 qth_max, u8 Wlog) +static inline bool red_check_params(u32 qth_min, u32 qth_max, u8 Wlog, u8 Scell_log) { if (fls(qth_min) + Wlog > 32) return false; if (fls(qth_max) + Wlog > 32) return false; + if (Scell_log >= 32) + return false; if (qth_max < qth_min) return false; return true; diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index bd618b00d319..50f680f03a54 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -362,7 +362,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt, ctl = nla_data(tb[TCA_CHOKE_PARMS]); - if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) + if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log)) return -EINVAL; if (ctl->limit > CHOKE_MAX_QUEUE) diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 8599c6f31b05..e0bc77533acc 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -480,7 +480,7 @@ static inline int gred_change_vq(struct Qdisc *sch, int dp, struct gred_sched *table = qdisc_priv(sch); struct gred_sched_data *q = table->tab[dp]; - if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) { + if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log)) { NL_SET_ERR_MSG_MOD(extack, "invalid RED parameters"); return -EINVAL; } diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index e89fab6ccb34..b4ae34d7aa96 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -250,7 +250,7 @@ static int __red_change(struct Qdisc *sch, struct nlattr **tb, max_P = tb[TCA_RED_MAX_P] ? nla_get_u32(tb[TCA_RED_MAX_P]) : 0; ctl = nla_data(tb[TCA_RED_PARMS]); - if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) + if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log)) return -EINVAL; err = red_get_flags(ctl->flags, TC_RED_HISTORIC_FLAGS, diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index bca2be57d9fc..b25e51440623 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -647,7 +647,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) } if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max, - ctl_v1->Wlog)) + ctl_v1->Wlog, ctl_v1->Scell_log)) return -EINVAL; if (ctl_v1 && ctl_v1->qth_min) { p = kmalloc(sizeof(*p), GFP_KERNEL); From 5ede3ada3da7f050519112b81badc058190b9f9f Mon Sep 17 00:00:00 2001 From: Yunjian Wang Date: Sat, 26 Dec 2020 16:10:05 +0800 Subject: [PATCH 215/547] net: hns: fix return value check in __lb_other_process() The function skb_copy() could return NULL, the return value need to be checked. Fixes: b5996f11ea54 ("net: add Hisilicon Network Subsystem basic ethernet support") Signed-off-by: Yunjian Wang Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns/hns_ethtool.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c index 7165da0ee9aa..a6e3f07caf99 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c @@ -415,6 +415,10 @@ static void __lb_other_process(struct hns_nic_ring_data *ring_data, /* for mutl buffer*/ new_skb = skb_copy(skb, GFP_ATOMIC); dev_kfree_skb_any(skb); + if (!new_skb) { + netdev_err(ndev, "skb alloc failed\n"); + return; + } skb = new_skb; check_ok = 0; From 085c7c4e1c0e50d90b7d90f61a12e12b317a91e2 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 26 Dec 2020 15:44:53 -0800 Subject: [PATCH 216/547] erspan: fix version 1 check in gre_parse_header() Both version 0 and version 1 use ETH_P_ERSPAN, but version 0 does not have an erspan header. So the check in gre_parse_header() is wrong, we have to distinguish version 1 from version 0. We can just check the gre header length like is_erspan_type1(). Fixes: cb73ee40b1b3 ("net: ip_gre: use erspan key field for tunnel lookup") Reported-by: syzbot+f583ce3d4ddf9836b27a@syzkaller.appspotmail.com Cc: William Tu Cc: Lorenzo Bianconi Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/ipv4/gre_demux.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 66fdbfe5447c..5d1e6fe9d838 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -128,7 +128,7 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, * to 0 and sets the configured key in the * inner erspan header field */ - if (greh->protocol == htons(ETH_P_ERSPAN) || + if ((greh->protocol == htons(ETH_P_ERSPAN) && hdr_len != 4) || greh->protocol == htons(ETH_P_ERSPAN2)) { struct erspan_base_hdr *ershdr; From 9b22fece786ed641909988da4810bfa8e5d2e592 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=A9o=20Le=20Bouter?= Date: Sun, 27 Dec 2020 17:11:36 +0100 Subject: [PATCH 217/547] atlantic: remove architecture depends MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This was tested on a RaptorCS Talos II with IBM POWER9 DD2.2 CPUs and an ASUS XG-C100F PCI-e card without any issue. Speeds of ~8Gbps could be attained with not-very-scientific (wget HTTP) both-ways measurements on a local network. No warning or error reported in kernel logs. The drivers seems to be portable enough for it not to be gated like such. Signed-off-by: Léo Le Bouter Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/aquantia/Kconfig b/drivers/net/ethernet/aquantia/Kconfig index efb33c078a3c..cec2018c84a9 100644 --- a/drivers/net/ethernet/aquantia/Kconfig +++ b/drivers/net/ethernet/aquantia/Kconfig @@ -19,7 +19,6 @@ if NET_VENDOR_AQUANTIA config AQTION tristate "aQuantia AQtion(tm) Support" depends on PCI - depends on X86_64 || ARM64 || COMPILE_TEST depends on MACSEC || MACSEC=n help This enables the support for the aQuantia AQtion(tm) Ethernet card. From 1fef73597fa545c35fddc953979013882fbd4e55 Mon Sep 17 00:00:00 2001 From: Xie He Date: Sun, 27 Dec 2020 18:53:39 -0800 Subject: [PATCH 218/547] net: hdlc_ppp: Fix issues when mod_timer is called while timer is running ppp_cp_event is called directly or indirectly by ppp_rx with "ppp->lock" held. It may call mod_timer to add a new timer. However, at the same time ppp_timer may be already running and waiting for "ppp->lock". In this case, there's no need for ppp_timer to continue running and it can just exit. If we let ppp_timer continue running, it may call add_timer. This causes kernel panic because add_timer can't be called with a timer pending. This patch fixes this problem. Fixes: e022c2f07ae5 ("WAN: new synchronous PPP implementation for generic HDLC.") Cc: Krzysztof Halasa Signed-off-by: Xie He Signed-off-by: David S. Miller --- drivers/net/wan/hdlc_ppp.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/wan/hdlc_ppp.c b/drivers/net/wan/hdlc_ppp.c index 64f855651336..261b53fc8e04 100644 --- a/drivers/net/wan/hdlc_ppp.c +++ b/drivers/net/wan/hdlc_ppp.c @@ -569,6 +569,13 @@ static void ppp_timer(struct timer_list *t) unsigned long flags; spin_lock_irqsave(&ppp->lock, flags); + /* mod_timer could be called after we entered this function but + * before we got the lock. + */ + if (timer_pending(&proto->timer)) { + spin_unlock_irqrestore(&ppp->lock, flags); + return; + } switch (proto->state) { case STOPPING: case REQ_SENT: From 26b614fa441048a9f8e4a814c3b01756816ce7a7 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Wed, 16 Dec 2020 17:48:33 +0200 Subject: [PATCH 219/547] dmaengine: ti: k3-udma: Fix pktdma rchan TPL level setup Instead of initializing the rchan_tpl the initial commit re-initialized the tchan_tpl. Fixes: d2abc982333c0 ("dmaengine: ti: k3-udma: Initial support for K3 PKTDMA") Signed-off-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20201216154833.20821-1-peter.ujfalusi@ti.com Signed-off-by: Vinod Koul --- drivers/dma/ti/k3-udma.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/dma/ti/k3-udma.c b/drivers/dma/ti/k3-udma.c index 87157cbae1b8..298460438bb4 100644 --- a/drivers/dma/ti/k3-udma.c +++ b/drivers/dma/ti/k3-udma.c @@ -4698,9 +4698,9 @@ static int pktdma_setup_resources(struct udma_dev *ud) ud->tchan_tpl.levels = 1; } - ud->tchan_tpl.levels = ud->tchan_tpl.levels; - ud->tchan_tpl.start_idx[0] = ud->tchan_tpl.start_idx[0]; - ud->tchan_tpl.start_idx[1] = ud->tchan_tpl.start_idx[1]; + ud->rchan_tpl.levels = ud->tchan_tpl.levels; + ud->rchan_tpl.start_idx[0] = ud->tchan_tpl.start_idx[0]; + ud->rchan_tpl.start_idx[1] = ud->tchan_tpl.start_idx[1]; ud->tchan_map = devm_kmalloc_array(dev, BITS_TO_LONGS(ud->tchan_cnt), sizeof(unsigned long), GFP_KERNEL); From ff58f7dd0c1352a01de3a40327895bd51e03de3a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 16 Dec 2020 11:29:46 +0300 Subject: [PATCH 220/547] dmaengine: idxd: off by one in cleanup code The clean up is off by one so this will start at "i" and it should start with "i - 1" and then it doesn't unregister the zeroeth elements in the array. Fixes: c52ca478233c ("dmaengine: idxd: add configuration component of driver") Signed-off-by: Dan Carpenter Acked-by: Dave Jiang Link: https://lore.kernel.org/r/X9nFeojulsNqUSnG@mwanda Signed-off-by: Vinod Koul --- drivers/dma/idxd/sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 266423a2cabc..4dbb03c545e4 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -434,7 +434,7 @@ int idxd_register_driver(void) return 0; drv_fail: - for (; i > 0; i--) + while (--i >= 0) driver_unregister(&idxd_drvs[i]->drv); return rc; } @@ -1840,7 +1840,7 @@ int idxd_register_bus_type(void) return 0; bus_err: - for (; i > 0; i--) + while (--i >= 0) bus_unregister(idxd_bus_types[i]); return rc; } From 8fb28795fb64e1151c0e713686d8b026a5a2aece Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 18 Dec 2020 18:41:37 +0800 Subject: [PATCH 221/547] dmaengine: qcom: gpi: Fixes a format mismatch drivers/dma/qcom/gpi.c:1419:3: warning: format '%lu' expects argument of type 'long unsigned int', but argument 8 has type 'size_t {aka unsigned int}' [-Wformat=] drivers/dma/qcom/gpi.c:1427:31: warning: format '%lu' expects argument of type 'long unsigned int', but argument 3 has type 'size_t {aka unsigned int}' [-Wformat=] drivers/dma/qcom/gpi.c:1447:3: warning: format '%llx' expects argument of type 'long long unsigned int', but argument 4 has type 'dma_addr_t {aka unsigned int}' [-Wformat=] drivers/dma/qcom/gpi.c:1447:3: warning: format '%llx' expects argument of type 'long long unsigned int', but argument 5 has type 'phys_addr_t {aka unsigned int}' [-Wformat=] Signed-off-by: Xiaoming Ni Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20201218104137.59200-1-nixiaoming@huawei.com Signed-off-by: Vinod Koul --- drivers/dma/qcom/gpi.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/dma/qcom/gpi.c b/drivers/dma/qcom/gpi.c index d2334f535de2..556c070a514c 100644 --- a/drivers/dma/qcom/gpi.c +++ b/drivers/dma/qcom/gpi.c @@ -1416,7 +1416,7 @@ static int gpi_alloc_ring(struct gpi_ring *ring, u32 elements, len = 1 << bit; ring->alloc_size = (len + (len - 1)); dev_dbg(gpii->gpi_dev->dev, - "#el:%u el_size:%u len:%u actual_len:%llu alloc_size:%lu\n", + "#el:%u el_size:%u len:%u actual_len:%llu alloc_size:%zu\n", elements, el_size, (elements * el_size), len, ring->alloc_size); @@ -1424,7 +1424,7 @@ static int gpi_alloc_ring(struct gpi_ring *ring, u32 elements, ring->alloc_size, &ring->dma_handle, GFP_KERNEL); if (!ring->pre_aligned) { - dev_err(gpii->gpi_dev->dev, "could not alloc size:%lu mem for ring\n", + dev_err(gpii->gpi_dev->dev, "could not alloc size:%zu mem for ring\n", ring->alloc_size); return -ENOMEM; } @@ -1444,8 +1444,8 @@ static int gpi_alloc_ring(struct gpi_ring *ring, u32 elements, smp_wmb(); dev_dbg(gpii->gpi_dev->dev, - "phy_pre:0x%0llx phy_alig:0x%0llx len:%u el_size:%u elements:%u\n", - ring->dma_handle, ring->phys_addr, ring->len, + "phy_pre:%pad phy_alig:%pa len:%u el_size:%u elements:%u\n", + &ring->dma_handle, &ring->phys_addr, ring->len, ring->el_size, ring->elements); return 0; From 33cbd54dc515cc04b5a603603414222b4bb1448d Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 19 Dec 2020 13:47:18 +0100 Subject: [PATCH 222/547] dmaengine: mediatek: mtk-hsdma: Fix a resource leak in the error handling path of the probe function 'mtk_hsdma_hw_deinit()' should be called in the error handling path of the probe function to undo a previous 'mtk_hsdma_hw_init()' call, as already done in the remove function. Fixes: 548c4597e984 ("dmaengine: mediatek: Add MediaTek High-Speed DMA controller for MT7622 and MT7623 SoC") Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/20201219124718.182664-1-christophe.jaillet@wanadoo.fr Signed-off-by: Vinod Koul --- drivers/dma/mediatek/mtk-hsdma.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/dma/mediatek/mtk-hsdma.c b/drivers/dma/mediatek/mtk-hsdma.c index f133ae8dece1..6ad8afbb95f2 100644 --- a/drivers/dma/mediatek/mtk-hsdma.c +++ b/drivers/dma/mediatek/mtk-hsdma.c @@ -1007,6 +1007,7 @@ static int mtk_hsdma_probe(struct platform_device *pdev) return 0; err_free: + mtk_hsdma_hw_deinit(hsdma); of_dma_controller_free(pdev->dev.of_node); err_unregister: dma_async_device_unregister(dd); From d645148cc82ca7fbacaa601414a552184e9c6dd3 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 19 Dec 2020 14:28:00 +0100 Subject: [PATCH 223/547] dmaengine: milbeaut-xdmac: Fix a resource leak in the error handling path of the probe function 'disable_xdmac()' should be called in the error handling path of the probe function to undo a previous 'enable_xdmac()' call, as already done in the remove function. Fixes: a6e9be055d47 ("dmaengine: milbeaut-xdmac: Add XDMAC driver for Milbeaut platforms") Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/20201219132800.183254-1-christophe.jaillet@wanadoo.fr Signed-off-by: Vinod Koul --- drivers/dma/milbeaut-xdmac.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/dma/milbeaut-xdmac.c b/drivers/dma/milbeaut-xdmac.c index 584c931e807a..d29d01e730aa 100644 --- a/drivers/dma/milbeaut-xdmac.c +++ b/drivers/dma/milbeaut-xdmac.c @@ -350,7 +350,7 @@ static int milbeaut_xdmac_probe(struct platform_device *pdev) ret = dma_async_device_register(ddev); if (ret) - return ret; + goto disable_xdmac; ret = of_dma_controller_register(dev->of_node, of_dma_simple_xlate, mdev); @@ -363,6 +363,8 @@ static int milbeaut_xdmac_probe(struct platform_device *pdev) unregister_dmac: dma_async_device_unregister(ddev); +disable_xdmac: + disable_xdmac(mdev); return ret; } From 595a334148449bd1d27cf5d6fcb3b0d718cb1b9f Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 14 Dec 2020 14:56:52 +0300 Subject: [PATCH 224/547] dmaengine: dw-edma: Fix use after free in dw_edma_alloc_chunk() If the dw_edma_alloc_burst() function fails then we free "chunk" but it's still on the "desc->chunk->list" list so it will lead to a use after free. Also the "->chunks_alloc" count is incremented when it shouldn't be. In current kernels small allocations are guaranteed to succeed and dw_edma_alloc_burst() can't fail so this will not actually affect runtime. Fixes: e63d79d1ffcd ("dmaengine: Add Synopsys eDMA IP core driver") Signed-off-by: Dan Carpenter Acked-by: Gustavo Pimentel Link: https://lore.kernel.org/r/X9dTBFrUPEvvW7qc@mwanda Signed-off-by: Vinod Koul --- drivers/dma/dw-edma/dw-edma-core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dma/dw-edma/dw-edma-core.c b/drivers/dma/dw-edma/dw-edma-core.c index b971505b8715..08d71dafa001 100644 --- a/drivers/dma/dw-edma/dw-edma-core.c +++ b/drivers/dma/dw-edma/dw-edma-core.c @@ -86,12 +86,12 @@ static struct dw_edma_chunk *dw_edma_alloc_chunk(struct dw_edma_desc *desc) if (desc->chunk) { /* Create and add new element into the linked list */ - desc->chunks_alloc++; - list_add_tail(&chunk->list, &desc->chunk->list); if (!dw_edma_alloc_burst(chunk)) { kfree(chunk); return NULL; } + desc->chunks_alloc++; + list_add_tail(&chunk->list, &desc->chunk->list); } else { /* List head */ chunk->burst = NULL; From ba42f61b36121730d7f51cc261dfd744ee19f50b Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Wed, 16 Dec 2020 21:06:49 +0800 Subject: [PATCH 225/547] qcom: bam_dma: Delete useless kfree code The parameter of kfree function is NULL, so kfree code is useless, delete it. Therefore, goto expression is no longer needed, so simplify it. Signed-off-by: Zheng Yongjun Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20201216130649.13979-1-zhengyongjun3@huawei.com Signed-off-by: Vinod Koul --- drivers/dma/qcom/bam_dma.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/dma/qcom/bam_dma.c b/drivers/dma/qcom/bam_dma.c index d5773d474d8f..88579857ca1d 100644 --- a/drivers/dma/qcom/bam_dma.c +++ b/drivers/dma/qcom/bam_dma.c @@ -630,7 +630,7 @@ static struct dma_async_tx_descriptor *bam_prep_slave_sg(struct dma_chan *chan, GFP_NOWAIT); if (!async_desc) - goto err_out; + return NULL; if (flags & DMA_PREP_FENCE) async_desc->flags |= DESC_FLAG_NWD; @@ -670,10 +670,6 @@ static struct dma_async_tx_descriptor *bam_prep_slave_sg(struct dma_chan *chan, } return vchan_tx_prep(&bchan->vc, &async_desc->vd, flags); - -err_out: - kfree(async_desc); - return NULL; } /** From 28d8e07fc9478f8f14dd5dd4b2c382982fa12461 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Tue, 15 Dec 2020 15:13:47 +0200 Subject: [PATCH 226/547] MAINTAINERS: Add entry for Texas Instruments DMA drivers My employment with TI is coming to an end, it is my intention to look after the DMA drivers I have worked with over the years. Signed-off-by: Peter Ujfalusi Signed-off-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20201215131348.11282-2-peter.ujfalusi@ti.com Signed-off-by: Vinod Koul --- MAINTAINERS | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 546aa66428c9..217866b668b9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17553,6 +17553,19 @@ S: Supported F: Documentation/devicetree/bindings/iio/dac/ti,dac7612.txt F: drivers/iio/dac/ti-dac7612.c +TEXAS INSTRUMENTS DMA DRIVERS +M: Peter Ujfalusi +L: dmaengine@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/dma/ti-dma-crossbar.txt +F: Documentation/devicetree/bindings/dma/ti-edma.txt +F: Documentation/devicetree/bindings/dma/ti/ +F: drivers/dma/ti/ +X: drivers/dma/ti/cppi41.c +F: include/linux/dma/k3-udma-glue.h +F: include/linux/dma/ti-cppi5.h +F: include/linux/dma/k3-psil.h + TEXAS INSTRUMENTS' SYSTEM CONTROL INTERFACE (TISCI) PROTOCOL DRIVER M: Nishanth Menon M: Tero Kristo From cc465fa269bc0dc63a1ab7384110e4079fb40421 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Tue, 15 Dec 2020 15:13:48 +0200 Subject: [PATCH 227/547] dt-bindings: dma: ti: Update maintainer and author information My employment with TI is coming to an end, add the copyright and author comments as they due and change the maintainer mail address. Signed-off-by: Peter Ujfalusi Signed-off-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20201215131348.11282-3-peter.ujfalusi@ti.com Signed-off-by: Vinod Koul --- Documentation/devicetree/bindings/dma/ti/k3-bcdma.yaml | 4 +++- Documentation/devicetree/bindings/dma/ti/k3-pktdma.yaml | 4 +++- Documentation/devicetree/bindings/dma/ti/k3-udma.yaml | 4 +++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/dma/ti/k3-bcdma.yaml b/Documentation/devicetree/bindings/dma/ti/k3-bcdma.yaml index b15f68c499cb..df29d59d13a8 100644 --- a/Documentation/devicetree/bindings/dma/ti/k3-bcdma.yaml +++ b/Documentation/devicetree/bindings/dma/ti/k3-bcdma.yaml @@ -1,4 +1,6 @@ # SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +# Copyright (C) 2020 Texas Instruments Incorporated +# Author: Peter Ujfalusi %YAML 1.2 --- $id: http://devicetree.org/schemas/dma/ti/k3-bcdma.yaml# @@ -7,7 +9,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Texas Instruments K3 DMSS BCDMA Device Tree Bindings maintainers: - - Peter Ujfalusi + - Peter Ujfalusi description: | The Block Copy DMA (BCDMA) is intended to perform similar functions as the TR diff --git a/Documentation/devicetree/bindings/dma/ti/k3-pktdma.yaml b/Documentation/devicetree/bindings/dma/ti/k3-pktdma.yaml index b13ab60cd740..ea19d12a9337 100644 --- a/Documentation/devicetree/bindings/dma/ti/k3-pktdma.yaml +++ b/Documentation/devicetree/bindings/dma/ti/k3-pktdma.yaml @@ -1,4 +1,6 @@ # SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +# Copyright (C) 2020 Texas Instruments Incorporated +# Author: Peter Ujfalusi %YAML 1.2 --- $id: http://devicetree.org/schemas/dma/ti/k3-pktdma.yaml# @@ -7,7 +9,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Texas Instruments K3 DMSS PKTDMA Device Tree Bindings maintainers: - - Peter Ujfalusi + - Peter Ujfalusi description: | The Packet DMA (PKTDMA) is intended to perform similar functions as the packet diff --git a/Documentation/devicetree/bindings/dma/ti/k3-udma.yaml b/Documentation/devicetree/bindings/dma/ti/k3-udma.yaml index 9a87fd9041eb..6a09bbf83d46 100644 --- a/Documentation/devicetree/bindings/dma/ti/k3-udma.yaml +++ b/Documentation/devicetree/bindings/dma/ti/k3-udma.yaml @@ -1,4 +1,6 @@ # SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +# Copyright (C) 2019 Texas Instruments Incorporated +# Author: Peter Ujfalusi %YAML 1.2 --- $id: http://devicetree.org/schemas/dma/ti/k3-udma.yaml# @@ -7,7 +9,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Texas Instruments K3 NAVSS Unified DMA Device Tree Bindings maintainers: - - Peter Ujfalusi + - Peter Ujfalusi description: | The UDMA-P is intended to perform similar (but significantly upgraded) From 3deba4d8f07be264b21e81d604c6b569a41a33b5 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 29 Dec 2020 09:34:28 +0100 Subject: [PATCH 228/547] ALSA: usb-audio: Add quirk for BOSS AD-10 BOSS AD-10 requires the very same quirk like other BOSS devices to enable the special implicit feedback mode. Reported-and-tested-by: Martin Passing Link: https://lore.kernel.org/r/20201229083428.20467-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/usb/implicit.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/usb/implicit.c b/sound/usb/implicit.c index eb3a4c433c3e..70b9777b2e63 100644 --- a/sound/usb/implicit.c +++ b/sound/usb/implicit.c @@ -78,6 +78,7 @@ static const struct snd_usb_implicit_fb_match playback_implicit_fb_quirks[] = { IMPLICIT_FB_SKIP_DEV(0x0582, 0x01d6), /* BOSS GT-1 */ IMPLICIT_FB_SKIP_DEV(0x0582, 0x01d8), /* BOSS Katana */ IMPLICIT_FB_SKIP_DEV(0x0582, 0x01e5), /* BOSS GT-001 */ + IMPLICIT_FB_SKIP_DEV(0x0582, 0x0203), /* BOSS AD-10 */ {} /* terminator */ }; @@ -89,6 +90,7 @@ static const struct snd_usb_implicit_fb_match capture_implicit_fb_quirks[] = { IMPLICIT_FB_FIXED_DEV(0x0582, 0x01d6, 0x0d, 0x01), /* BOSS GT-1 */ IMPLICIT_FB_FIXED_DEV(0x0582, 0x01d8, 0x0d, 0x01), /* BOSS Katana */ IMPLICIT_FB_FIXED_DEV(0x0582, 0x01e5, 0x0d, 0x01), /* BOSS GT-001 */ + IMPLICIT_FB_FIXED_DEV(0x0582, 0x0203, 0x0d, 0x01), /* BOSS AD-10 */ {} /* terminator */ }; From cffa4b2122f5f3e53cf3d529bbc74651f95856d5 Mon Sep 17 00:00:00 2001 From: Xiaolei Wang Date: Tue, 29 Dec 2020 18:50:46 +0800 Subject: [PATCH 229/547] regmap: debugfs: Fix a memory leak when calling regmap_attach_dev After initializing the regmap through syscon_regmap_lookup_by_compatible, then regmap_attach_dev to the device, because the debugfs_name has been allocated, there is no need to redistribute it again unreferenced object 0xd8399b80 (size 64): comm "swapper/0", pid 1, jiffies 4294937641 (age 278.590s) hex dump (first 32 bytes): 64 75 6d 6d 79 2d 69 6f 6d 75 78 63 2d 67 70 72 dummy-iomuxc-gpr 40 32 30 65 34 30 30 30 00 7f 52 5b d8 7e 42 69 @20e4000..R[.~Bi backtrace: [] kasprintf+0x2c/0x54 [<6ad3bbc2>] regmap_debugfs_init+0xdc/0x2fc [] __regmap_init+0xc38/0xd88 [<1f7e0609>] of_syscon_register+0x168/0x294 [<735e8766>] device_node_get_regmap+0x6c/0x98 [] imx6ul_init_machine+0x20/0x88 [<0456565b>] customize_machine+0x1c/0x30 [] do_one_initcall+0x80/0x3ac [<7e584867>] kernel_init_freeable+0x170/0x1f0 [<80074741>] kernel_init+0x8/0x120 [<285d6f28>] ret_from_fork+0x14/0x20 [<00000000>] 0x0 Fixes: 9b947a13e7f6 ("regmap: use debugfs even when no device") Signed-off-by: Xiaolei Wang Link: https://lore.kernel.org/r/20201229105046.41984-1-xiaolei.wang@windriver.com Signed-off-by: Mark Brown --- drivers/base/regmap/regmap-debugfs.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/base/regmap/regmap-debugfs.c b/drivers/base/regmap/regmap-debugfs.c index 8dfac7f3ed7a..bf03cd343be2 100644 --- a/drivers/base/regmap/regmap-debugfs.c +++ b/drivers/base/regmap/regmap-debugfs.c @@ -582,18 +582,25 @@ void regmap_debugfs_init(struct regmap *map) devname = dev_name(map->dev); if (name) { - map->debugfs_name = kasprintf(GFP_KERNEL, "%s-%s", + if (!map->debugfs_name) { + map->debugfs_name = kasprintf(GFP_KERNEL, "%s-%s", devname, name); + if (!map->debugfs_name) + return; + } name = map->debugfs_name; } else { name = devname; } if (!strcmp(name, "dummy")) { - kfree(map->debugfs_name); + if (!map->debugfs_name) + kfree(map->debugfs_name); map->debugfs_name = kasprintf(GFP_KERNEL, "dummy%d", dummy_index); + if (!map->debugfs_name) + return; name = map->debugfs_name; dummy_index++; } From ede090f5a438e97d0586f64067bbb956e30a2a31 Mon Sep 17 00:00:00 2001 From: Xu Yilun Date: Tue, 29 Dec 2020 13:27:41 +0800 Subject: [PATCH 230/547] spi: altera: fix return value for altera_spi_txrx() This patch fixes the return value for altera_spi_txrx. It should return 1 for interrupt transfer mode, and return 0 for polling transfer mode. The altera_spi_txrx() implements the spi_controller.transfer_one callback. According to the spi-summary.rst, the transfer_one should return 0 when transfer is finished, return 1 when transfer is still in progress. Signed-off-by: Xu Yilun Link: https://lore.kernel.org/r/1609219662-27057-2-git-send-email-yilun.xu@intel.com Signed-off-by: Mark Brown --- drivers/spi/spi-altera.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/drivers/spi/spi-altera.c b/drivers/spi/spi-altera.c index 809bfff3690a..cbc4c28c1541 100644 --- a/drivers/spi/spi-altera.c +++ b/drivers/spi/spi-altera.c @@ -189,24 +189,26 @@ static int altera_spi_txrx(struct spi_master *master, /* send the first byte */ altera_spi_tx_word(hw); - } else { - while (hw->count < hw->len) { - altera_spi_tx_word(hw); - for (;;) { - altr_spi_readl(hw, ALTERA_SPI_STATUS, &val); - if (val & ALTERA_SPI_STATUS_RRDY_MSK) - break; - - cpu_relax(); - } - - altera_spi_rx_word(hw); - } - spi_finalize_current_transfer(master); + return 1; } - return t->len; + while (hw->count < hw->len) { + altera_spi_tx_word(hw); + + for (;;) { + altr_spi_readl(hw, ALTERA_SPI_STATUS, &val); + if (val & ALTERA_SPI_STATUS_RRDY_MSK) + break; + + cpu_relax(); + } + + altera_spi_rx_word(hw); + } + spi_finalize_current_transfer(master); + + return 0; } static irqreturn_t altera_spi_irq(int irq, void *dev) From da4282c17d695b9311608aa63b3c633e649aadea Mon Sep 17 00:00:00 2001 From: Jiang Wang Date: Thu, 24 Dec 2020 01:12:42 +0000 Subject: [PATCH 231/547] selftests/bpf: Fix a compile error for BPF_F_BPRM_SECUREEXEC When CONFIG_BPF_LSM is not configured, running bpf selftesting will show BPF_F_BPRM_SECUREEXEC undefined error for bprm_opts.c. The problem is that bprm_opts.c includes vmliunx.h. The vmlinux.h is generated by "bpftool btf dump file ./vmlinux format c". On the other hand, BPF_F_BPRM_SECUREEXEC is defined in include/uapi/linux/bpf.h and used only in bpf_lsm.c. When CONFIG_BPF_LSM is not set, bpf_lsm will not be compiled, so vmlinux.h will not include definition of BPF_F_BPRM_SECUREEXEC. Ideally, we want to compile bpf selftest regardless of the configuration setting, so change the include file from vmlinux.h to bpf.h. Signed-off-by: Jiang Wang Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20201224011242.585967-1-jiang.wang@bytedance.com --- tools/testing/selftests/bpf/progs/bprm_opts.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/bprm_opts.c b/tools/testing/selftests/bpf/progs/bprm_opts.c index 5bfef2887e70..418d9c6d4952 100644 --- a/tools/testing/selftests/bpf/progs/bprm_opts.c +++ b/tools/testing/selftests/bpf/progs/bprm_opts.c @@ -4,7 +4,7 @@ * Copyright 2020 Google LLC. */ -#include "vmlinux.h" +#include #include #include #include From a694ffed876575d1df1a47067444047182de4354 Mon Sep 17 00:00:00 2001 From: Iskren Chernev Date: Mon, 28 Dec 2020 23:31:30 +0200 Subject: [PATCH 232/547] drm/msm: Fix null dereference in _msm_gem_new The crash was caused by locking an uninitialized lock during init of drm_gem_object. The lock changed in the breaking commit, but the init was not moved accordingly. 8<--- cut here --- Unable to handle kernel NULL pointer dereference at virtual address 00000000 pgd = (ptrval) [00000000] *pgd=00000000 Internal error: Oops: 5 [#1] PREEMPT SMP ARM Modules linked in: msm(+) qcom_spmi_vadc qcom_vadc_common dm_mod usb_f_rndis rmi_i2c rmi_core qnoc_msm8974 icc_smd_rpm pm8941_pwrkey CPU: 2 PID: 1020 Comm: udevd Not tainted 5.10.0-postmarketos-qcom-msm8974 #8 Hardware name: Generic DT based system PC is at ww_mutex_lock+0x20/0xb0 LR is at _msm_gem_new+0x13c/0x298 [msm] pc : [] lr : [] psr: 20000013 sp : c36e7ad0 ip : c3b3d800 fp : 00000000 r10: 00000001 r9 : c3b22800 r8 : 00000000 r7 : c3b23000 r6 : c3b3d600 r5 : c3b3d600 r4 : 00000000 r3 : c34b4780 r2 : c3b3d6f4 r1 : 00000000 r0 : 00000000 Flags: nzCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment none Control: 10c5787d Table: 03ae406a DAC: 00000051 Process udevd (pid: 1020, stack limit = 0x(ptrval)) Stack: (0xc36e7ad0 to 0xc36e8000) [...] [] (ww_mutex_lock) from [] (_msm_gem_new+0x13c/0x298 [msm]) [] (_msm_gem_new [msm]) from [] (_msm_gem_kernel_new+0x20/0x190 [msm]) [] (_msm_gem_kernel_new [msm]) from [] (msm_gem_kernel_new+0x24/0x2c [msm]) [] (msm_gem_kernel_new [msm]) from [] (msm_gpu_init+0x308/0x548 [msm]) [] (msm_gpu_init [msm]) from [] (adreno_gpu_init+0x13c/0x240 [msm]) [] (adreno_gpu_init [msm]) from [] (a3xx_gpu_init+0x78/0x1dc [msm]) [] (a3xx_gpu_init [msm]) from [] (adreno_bind+0x1cc/0x274 [msm]) [] (adreno_bind [msm]) from [] (component_bind_all+0x11c/0x278) [] (component_bind_all) from [] (msm_drm_bind+0x18c/0x5b4 [msm]) [] (msm_drm_bind [msm]) from [] (try_to_bring_up_master+0x200/0x2c8) [] (try_to_bring_up_master) from [] (component_master_add_with_match+0xc8/0xfc) [] (component_master_add_with_match) from [] (msm_pdev_probe+0x288/0x2c4 [msm]) [] (msm_pdev_probe [msm]) from [] (platform_drv_probe+0x48/0x98) [] (platform_drv_probe) from [] (really_probe+0x108/0x528) [] (really_probe) from [] (driver_probe_device+0x78/0x1d4) [] (driver_probe_device) from [] (device_driver_attach+0xa8/0xb0) [] (device_driver_attach) from [] (__driver_attach+0xb4/0x154) [] (__driver_attach) from [] (bus_for_each_dev+0x78/0xb8) [] (bus_for_each_dev) from [] (bus_add_driver+0x10c/0x208) [] (bus_add_driver) from [] (driver_register+0x88/0x118) [] (driver_register) from [] (do_one_initcall+0x50/0x2b0) [] (do_one_initcall) from [] (do_init_module+0x60/0x288) [] (do_init_module) from [] (sys_finit_module+0xd4/0x120) [] (sys_finit_module) from [] (ret_fast_syscall+0x0/0x54) Exception stack(0xc36e7fa8 to 0xc36e7ff0) 7fa0: 00020000 00000000 00000007 b6edd5b0 00000000 b6f2ff20 7fc0: 00020000 00000000 0000017b 0000017b b6eef980 bedc3a54 00473c99 00000000 7fe0: b6edd5b0 bedc3918 b6ed8a5f b6f6a8b0 Code: e3c3303f e593300c e1a04000 f590f000 (e1940f9f) ---[ end trace 277e2a3da40bbb76 ]--- Fixes: 6c0e3ea250476 ("drm/msm/gem: Switch over to obj->resv for locking") Signed-off-by: Iskren Chernev Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_gem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index a21be5b910ff..d9a5a1895f3d 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -1101,6 +1101,8 @@ static struct drm_gem_object *_msm_gem_new(struct drm_device *dev, struct msm_gem_vma *vma; struct page **pages; + drm_gem_private_object_init(dev, obj, size); + msm_gem_lock(obj); vma = add_vma(obj, NULL); @@ -1112,7 +1114,6 @@ static struct drm_gem_object *_msm_gem_new(struct drm_device *dev, to_msm_bo(obj)->vram_node = &vma->node; - drm_gem_private_object_init(dev, obj, size); pages = get_pages(obj); if (IS_ERR(pages)) { From 07fcad0d726d5da7c43f1c8e8fdb66c93a140ca5 Mon Sep 17 00:00:00 2001 From: Iskren Chernev Date: Mon, 28 Dec 2020 23:31:31 +0200 Subject: [PATCH 233/547] drm/msm: Ensure get_pages is called when locked get_pages is only called in a locked context. Add a WARN_ON to make sure it stays that way. Signed-off-by: Iskren Chernev Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_gem.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index d9a5a1895f3d..114c0711a302 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -96,6 +96,8 @@ static struct page **get_pages(struct drm_gem_object *obj) { struct msm_gem_object *msm_obj = to_msm_bo(obj); + WARN_ON(!msm_gem_is_locked(obj)); + if (!msm_obj->pages) { struct drm_device *dev = obj->dev; struct page **p; @@ -1114,8 +1116,9 @@ static struct drm_gem_object *_msm_gem_new(struct drm_device *dev, to_msm_bo(obj)->vram_node = &vma->node; - + msm_gem_lock(obj); pages = get_pages(obj); + msm_gem_unlock(obj); if (IS_ERR(pages)) { ret = PTR_ERR(pages); goto fail; From 77788775c7132a8d93c6930ab1bd84fc743c7cb7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 29 Dec 2020 10:50:46 -0700 Subject: [PATCH 234/547] io_uring: don't assume mm is constant across submits If we COW the identity, we assume that ->mm never changes. But this isn't true of multiple processes end up sharing the ring. Hence treat id->mm like like any other process compontent when it comes to the identity mapping. This is pretty trivial, just moving the existing grab into io_grab_identity(), and including a check for the match. Cc: stable@vger.kernel.org # 5.10 Fixes: 1e6fa5216a0e ("io_uring: COW io_identity on mismatch") Reported-by: Christian Brauner : Tested-by: Christian Brauner : Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 7e35283fc0b1..eb4620ff638e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1501,6 +1501,13 @@ static bool io_grab_identity(struct io_kiocb *req) spin_unlock_irq(&ctx->inflight_lock); req->work.flags |= IO_WQ_WORK_FILES; } + if (!(req->work.flags & IO_WQ_WORK_MM) && + (def->work_flags & IO_WQ_WORK_MM)) { + if (id->mm != current->mm) + return false; + mmgrab(id->mm); + req->work.flags |= IO_WQ_WORK_MM; + } return true; } @@ -1525,13 +1532,6 @@ static void io_prep_async_work(struct io_kiocb *req) req->work.flags |= IO_WQ_WORK_UNBOUND; } - /* ->mm can never change on us */ - if (!(req->work.flags & IO_WQ_WORK_MM) && - (def->work_flags & IO_WQ_WORK_MM)) { - mmgrab(id->mm); - req->work.flags |= IO_WQ_WORK_MM; - } - /* if we fail grabbing identity, we must COW, regrab, and retry */ if (io_grab_identity(req)) return; From b000700d6db50c933ce8b661154e26cf4ad06dba Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Sat, 26 Dec 2020 15:27:14 +0800 Subject: [PATCH 235/547] habanalabs: Fix memleak in hl_device_reset When kzalloc() fails, we should execute hl_mmu_fini() to release the MMU module. It's the same when hl_ctx_init() fails. Signed-off-by: Dinghao Liu Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 0749c92cbcf6..1456eabf9601 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -1092,6 +1092,7 @@ kill_processes: GFP_KERNEL); if (!hdev->kernel_ctx) { rc = -ENOMEM; + hl_mmu_fini(hdev); goto out_err; } @@ -1103,6 +1104,7 @@ kill_processes: "failed to init kernel ctx in hard reset\n"); kfree(hdev->kernel_ctx); hdev->kernel_ctx = NULL; + hl_mmu_fini(hdev); goto out_err; } } From 7cf22a1c88c05ea3807f95b1edfebb729016ae52 Mon Sep 17 00:00:00 2001 From: Harish Date: Tue, 29 Dec 2020 15:14:22 -0800 Subject: [PATCH 236/547] selftests/vm: fix building protection keys test Commit d8cbe8bfa7d ("tools/testing/selftests/vm: fix build error") tried to include a ARCH check for powerpc, however ARCH is not defined in the Makefile before including lib.mk. This makes test building to skip on both x86 and powerpc. Fix the arch check by replacing it using machine type as it is already defined and used in the test. Link: https://lkml.kernel.org/r/20201215100402.257376-1-harish@linux.ibm.com Fixes: d8cbe8bfa7d ("tools/testing/selftests/vm: fix build error") Signed-off-by: Harish Reviewed-by: Sandipan Das Cc: Shuah Khan Cc: Sandipan Das Cc: John Hubbard Cc: Dave Hansen Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/Makefile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 9a25307f6115..d42115e4284d 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -4,7 +4,7 @@ include local_config.mk uname_M := $(shell uname -m 2>/dev/null || echo not) -MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/') +MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/ppc64/') # Without this, failed build products remain, with up-to-date timestamps, # thus tricking Make (and you!) into believing that All Is Well, in subsequent @@ -43,7 +43,7 @@ TEST_GEN_FILES += thuge-gen TEST_GEN_FILES += transhuge-stress TEST_GEN_FILES += userfaultfd -ifeq ($(ARCH),x86_64) +ifeq ($(MACHINE),x86_64) CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_32bit_program.c -m32) CAN_BUILD_X86_64 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_64bit_program.c) CAN_BUILD_WITH_NOPIE := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_program.c -no-pie) @@ -65,13 +65,13 @@ TEST_GEN_FILES += $(BINARIES_64) endif else -ifneq (,$(findstring $(ARCH),powerpc)) +ifneq (,$(findstring $(MACHINE),ppc64)) TEST_GEN_FILES += protection_keys endif endif -ifneq (,$(filter $(MACHINE),arm64 ia64 mips64 parisc64 ppc64 ppc64le riscv64 s390x sh64 sparc64 x86_64)) +ifneq (,$(filter $(MACHINE),arm64 ia64 mips64 parisc64 ppc64 riscv64 s390x sh64 sparc64 x86_64)) TEST_GEN_FILES += va_128TBswitch TEST_GEN_FILES += virtual_address_range TEST_GEN_FILES += write_to_hugetlbfs @@ -84,7 +84,7 @@ TEST_FILES := test_vmalloc.sh KSFT_KHDR_INSTALL := 1 include ../lib.mk -ifeq ($(ARCH),x86_64) +ifeq ($(MACHINE),x86_64) BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64)) From e7dd91c456a8cdbcd7066997d15e36d14276a949 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 29 Dec 2020 15:14:25 -0800 Subject: [PATCH 237/547] mm/hugetlb: fix deadlock in hugetlb_cow error path syzbot reported the deadlock here [1]. The issue is in hugetlb cow error handling when there are not enough huge pages for the faulting task which took the original reservation. It is possible that other (child) tasks could have consumed pages associated with the reservation. In this case, we want the task which took the original reservation to succeed. So, we unmap any associated pages in children so that they can be used by the faulting task that owns the reservation. The unmapping code needs to hold i_mmap_rwsem in write mode. However, due to commit c0d0381ade79 ("hugetlbfs: use i_mmap_rwsem for more pmd sharing synchronization") we are already holding i_mmap_rwsem in read mode when hugetlb_cow is called. Technically, i_mmap_rwsem does not need to be held in read mode for COW mappings as they can not share pmd's. Modifying the fault code to not take i_mmap_rwsem in read mode for COW (and other non-sharable) mappings is too involved for a stable fix. Instead, we simply drop the hugetlb_fault_mutex and i_mmap_rwsem before unmapping. This is OK as it is technically not needed. They are reacquired after unmapping as expected by calling code. Since this is done in an uncommon error path, the overhead of dropping and reacquiring mutexes is acceptable. While making changes, remove redundant BUG_ON after unmap_ref_private. [1] https://lkml.kernel.org/r/000000000000b73ccc05b5cf8558@google.com Link: https://lkml.kernel.org/r/4c5781b8-3b00-761e-c0c7-c5edebb6ec1a@oracle.com Fixes: c0d0381ade79 ("hugetlbfs: use i_mmap_rwsem for more pmd sharing synchronization") Signed-off-by: Mike Kravetz Reported-by: syzbot+5eee4145df3c15e96625@syzkaller.appspotmail.com Cc: Naoya Horiguchi Cc: Michal Hocko Cc: Hugh Dickins Cc: "Aneesh Kumar K . V" Cc: Davidlohr Bueso Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cbf32d2824fd..a2602969873d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4105,10 +4105,30 @@ retry_avoidcopy: * may get SIGKILLed if it later faults. */ if (outside_reserve) { + struct address_space *mapping = vma->vm_file->f_mapping; + pgoff_t idx; + u32 hash; + put_page(old_page); BUG_ON(huge_pte_none(pte)); + /* + * Drop hugetlb_fault_mutex and i_mmap_rwsem before + * unmapping. unmapping needs to hold i_mmap_rwsem + * in write mode. Dropping i_mmap_rwsem in read mode + * here is OK as COW mappings do not interact with + * PMD sharing. + * + * Reacquire both after unmap operation. + */ + idx = vma_hugecache_offset(h, vma, haddr); + hash = hugetlb_fault_mutex_hash(mapping, idx); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + i_mmap_unlock_read(mapping); + unmap_ref_private(mm, vma, old_page, haddr); - BUG_ON(huge_pte_none(pte)); + + i_mmap_lock_read(mapping); + mutex_lock(&hugetlb_fault_mutex_table[hash]); spin_lock(ptl); ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (likely(ptep && From 3a176b94609a18f5f8bac7ddbf8923bd737262db Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 29 Dec 2020 15:14:28 -0800 Subject: [PATCH 238/547] Revert "kbuild: avoid static_assert for genksyms" This reverts commit 14dc3983b5dff513a90bd5a8cc90acaf7867c3d0. Macro Elver had sent a fix proper fix earlier, and also pointed out corner cases: "I guess what you propose is simpler, but might still have corner cases where we still get warnings. In particular, if some file (for whatever reason) does not include build_bug.h and uses a raw _Static_assert(), then we still get warnings. E.g. I see 1 user of raw _Static_assert() (drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h )." I believe the raw use of _Static_assert() should be allowed, so this should be fixed in genksyms. Even after commit 14dc3983b5df ("kbuild: avoid static_assert for genksyms"), I confirmed the following test code emits the warning. ---------------->8---------------- #include _Static_assert((1 ?: 0), ""); void foo(void) { } EXPORT_SYMBOL(foo); ---------------->8---------------- WARNING: modpost: EXPORT symbol "foo" [vmlinux] version generation failed, symbol will not be versioned. Now that commit 869b91992bce ("genksyms: Ignore module scoped _Static_assert()") fixed this issue properly, the workaround should be reverted. Link: https://lkml.org/lkml/2020/12/10/845 Link: https://lkml.kernel.org/r/20201219183911.181442-1-masahiroy@kernel.org Signed-off-by: Masahiro Yamada Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/build_bug.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/linux/build_bug.h b/include/linux/build_bug.h index 7bb66e15b481..e3a0be2c90ad 100644 --- a/include/linux/build_bug.h +++ b/include/linux/build_bug.h @@ -77,9 +77,4 @@ #define static_assert(expr, ...) __static_assert(expr, ##__VA_ARGS__, #expr) #define __static_assert(expr, msg, ...) _Static_assert(expr, msg) -#ifdef __GENKSYMS__ -/* genksyms gets confused by _Static_assert */ -#define _Static_assert(expr, ...) -#endif - #endif /* _LINUX_BUILD_BUG_H */ From 5dbdb2d87c294401a22e6a6002f08ebc9fbea38b Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 29 Dec 2020 15:14:31 -0800 Subject: [PATCH 239/547] checkpatch: prefer strscpy to strlcpy Prefer strscpy over the deprecated strlcpy function. Link: https://lkml.kernel.org/r/19fe91084890e2c16fe56f960de6c570a93fa99b.camel@perches.com Signed-off-by: Joe Perches Requested-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 00085308ed9d..92e888ed939f 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -6646,6 +6646,12 @@ sub process { # } # } +# strlcpy uses that should likely be strscpy + if ($line =~ /\bstrlcpy\s*\(/) { + WARN("STRLCPY", + "Prefer strscpy over strlcpy - see: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw\@mail.gmail.com/\n" . $herecurr); + } + # typecasts on min/max could be min_t/max_t if ($perl_version_ok && defined $stat && From 6d87d0ece58bc0022ca5247721a8eb06ef66b673 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Tue, 29 Dec 2020 15:14:34 -0800 Subject: [PATCH 240/547] mm: add prototype for __add_to_page_cache_locked() Otherwise it causes a gcc warning: mm/filemap.c:830:14: warning: no previous prototype for `__add_to_page_cache_locked' [-Wmissing-prototypes] A previous attempt to make this function static led to compilation errors when CONFIG_DEBUG_INFO_BTF is enabled because __add_to_page_cache_locked() is referred to by BPF code. Adding a prototype will silence the warning. Link: https://lkml.kernel.org/r/1608693702-4665-1-git-send-email-jrdr.linux@gmail.com Signed-off-by: Souptick Joarder Cc: Alex Shi Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 5299b90a6c40..c1e908184442 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -216,6 +216,13 @@ int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *, loff_t *); +/* + * Any attempt to mark this function as static leads to build failure + * when CONFIG_DEBUG_INFO_BTF is enabled because __add_to_page_cache_locked() + * is referred to by BPF code. This must be visible for error injection. + */ +int __add_to_page_cache_locked(struct page *page, struct address_space *mapping, + pgoff_t index, gfp_t gfp, void **shadowp); #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) From dc2da7b45ffe954a0090f5d0310ed7b0b37d2bd2 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Tue, 29 Dec 2020 15:14:37 -0800 Subject: [PATCH 241/547] mm: memmap defer init doesn't work as expected VMware observed a performance regression during memmap init on their platform, and bisected to commit 73a6e474cb376 ("mm: memmap_init: iterate over memblock regions rather that check each PFN") causing it. Before the commit: [0.033176] Normal zone: 1445888 pages used for memmap [0.033176] Normal zone: 89391104 pages, LIFO batch:63 [0.035851] ACPI: PM-Timer IO Port: 0x448 With commit [0.026874] Normal zone: 1445888 pages used for memmap [0.026875] Normal zone: 89391104 pages, LIFO batch:63 [2.028450] ACPI: PM-Timer IO Port: 0x448 The root cause is the current memmap defer init doesn't work as expected. Before, memmap_init_zone() was used to do memmap init of one whole zone, to initialize all low zones of one numa node, but defer memmap init of the last zone in that numa node. However, since commit 73a6e474cb376, function memmap_init() is adapted to iterater over memblock regions inside one zone, then call memmap_init_zone() to do memmap init for each region. E.g, on VMware's system, the memory layout is as below, there are two memory regions in node 2. The current code will mistakenly initialize the whole 1st region [mem 0xab00000000-0xfcffffffff], then do memmap defer to iniatialize only one memmory section on the 2nd region [mem 0x10000000000-0x1033fffffff]. In fact, we only expect to see that there's only one memory section's memmap initialized. That's why more time is costed at the time. [ 0.008842] ACPI: SRAT: Node 0 PXM 0 [mem 0x00000000-0x0009ffff] [ 0.008842] ACPI: SRAT: Node 0 PXM 0 [mem 0x00100000-0xbfffffff] [ 0.008843] ACPI: SRAT: Node 0 PXM 0 [mem 0x100000000-0x55ffffffff] [ 0.008844] ACPI: SRAT: Node 1 PXM 1 [mem 0x5600000000-0xaaffffffff] [ 0.008844] ACPI: SRAT: Node 2 PXM 2 [mem 0xab00000000-0xfcffffffff] [ 0.008845] ACPI: SRAT: Node 2 PXM 2 [mem 0x10000000000-0x1033fffffff] Now, let's add a parameter 'zone_end_pfn' to memmap_init_zone() to pass down the real zone end pfn so that defer_init() can use it to judge whether defer need be taken in zone wide. Link: https://lkml.kernel.org/r/20201223080811.16211-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20201223080811.16211-2-bhe@redhat.com Fixes: commit 73a6e474cb376 ("mm: memmap_init: iterate over memblock regions rather that check each PFN") Signed-off-by: Baoquan He Reported-by: Rahul Gopakumar Reviewed-by: Mike Rapoport Cc: David Hildenbrand Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 4 ++-- include/linux/mm.h | 5 +++-- mm/memory_hotplug.c | 2 +- mm/page_alloc.c | 8 +++++--- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 9b5acf8fb092..e76386a3479e 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -536,7 +536,7 @@ virtual_memmap_init(u64 start, u64 end, void *arg) if (map_start < map_end) memmap_init_zone((unsigned long)(map_end - map_start), - args->nid, args->zone, page_to_pfn(map_start), + args->nid, args->zone, page_to_pfn(map_start), page_to_pfn(map_end), MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); return 0; } @@ -546,7 +546,7 @@ memmap_init (unsigned long size, int nid, unsigned long zone, unsigned long start_pfn) { if (!vmem_map) { - memmap_init_zone(size, nid, zone, start_pfn, + memmap_init_zone(size, nid, zone, start_pfn, start_pfn + size, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); } else { struct page *start; diff --git a/include/linux/mm.h b/include/linux/mm.h index c1e908184442..ecdf8a8cd6ae 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2439,8 +2439,9 @@ extern int __meminit early_pfn_to_nid(unsigned long pfn); #endif extern void set_dma_reserve(unsigned long new_dma_reserve); -extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, - enum meminit_context, struct vmem_altmap *, int migratetype); +extern void memmap_init_zone(unsigned long, int, unsigned long, + unsigned long, unsigned long, enum meminit_context, + struct vmem_altmap *, int migratetype); extern void setup_per_zone_wmarks(void); extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index af41fb990820..f9d57b9be8c7 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -713,7 +713,7 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, * expects the zone spans the pfn range. All the pages in the range * are reserved so nobody should be touching them so we should be safe */ - memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, + memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, 0, MEMINIT_HOTPLUG, altmap, migratetype); set_zone_contiguous(zone); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7a2c89b21115..bdbec4c98173 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -423,6 +423,8 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) if (end_pfn < pgdat_end_pfn(NODE_DATA(nid))) return false; + if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX) + return true; /* * We start only with one section of pages, more pages are added as * needed until the rest of deferred pages are initialized. @@ -6116,7 +6118,7 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn) * zone stats (e.g., nr_isolate_pageblock) are touched. */ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, - unsigned long start_pfn, + unsigned long start_pfn, unsigned long zone_end_pfn, enum meminit_context context, struct vmem_altmap *altmap, int migratetype) { @@ -6152,7 +6154,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, if (context == MEMINIT_EARLY) { if (overlap_memmap_init(zone, &pfn)) continue; - if (defer_init(nid, pfn, end_pfn)) + if (defer_init(nid, pfn, zone_end_pfn)) break; } @@ -6266,7 +6268,7 @@ void __meminit __weak memmap_init(unsigned long size, int nid, if (end_pfn > start_pfn) { size = end_pfn - start_pfn; - memmap_init_zone(size, nid, zone, start_pfn, + memmap_init_zone(size, nid, zone, start_pfn, range_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); } } From e05986ee7a5814bec0e0075d813daca3d46e4a9e Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Tue, 29 Dec 2020 15:14:40 -0800 Subject: [PATCH 242/547] mm/mremap.c: fix extent calculation When `next < old_addr`, `next - old_addr` arithmetic underflows causing `extent` to be incorrect. Make `extent` the smaller of `next - old_addr` or `old_end - old_addr`. Link: https://lkml.kernel.org/r/20201219170433.2418867-1-kaleshsingh@google.com Fixes: c49dd34018026 ("mm: speedup mremap on 1GB or larger regions") Signed-off-by: Kalesh Singh Reported-by: Guenter Roeck Tested-by: Guenter Roeck Cc: Suren Baghdasaryan Cc: Minchan Kim Cc: Lokesh Gidra Cc: Helge Deller Cc: Kalesh Singh Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mremap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/mremap.c b/mm/mremap.c index c5590afe7165..f554320281cc 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -358,7 +358,9 @@ static unsigned long get_extent(enum pgt_entry entry, unsigned long old_addr, next = (old_addr + size) & mask; /* even if next overflowed, extent below will be ok */ - extent = (next > old_end) ? old_end - old_addr : next - old_addr; + extent = next - old_addr; + if (extent > old_end - old_addr) + extent = old_end - old_addr; next = (new_addr + size) & mask; if (extent > next - new_addr) extent = next - new_addr; From 111fe7186b29d172729db5e294875b9fc7a0ec1d Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 29 Dec 2020 15:14:43 -0800 Subject: [PATCH 243/547] mm: generalise COW SMC TLB flushing race comment I'm not sure if I'm completely missing something here, but AFAIKS the reference to the mysterious "COW SMC race" confuses the issue. The original changelog and mailing list thread didn't help me either. This SMC race is where the problem was detected, but isn't the general problem bigger and more obvious: that the new PTE could be picked up at any time by any TLB while entries for the old PTE exist in other TLBs before the TLB flush takes effect? The case where the iTLB and dTLB of a CPU are pointing at different pages is an interesting one but follows from the general problem. The other (minor) thing with the comment I think it makes it a bit clearer to say what the old code was doing (i.e., it avoids the race as opposed to what?). References: 4ce072f1faf29 ("mm: fix a race condition under SMC + COW") Link: https://lkml.kernel.org/r/20201215121119.351650-1-npiggin@gmail.com Reviewed-by: Matthew Wilcox (Oracle) Cc: Suresh Siddha Cc: "David S. Miller" Cc: Hugh Dickins Cc: Peter Zijlstra Cc: Suresh Siddha Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 7d608765932b..feff48e1465a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2892,11 +2892,13 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) entry = mk_pte(new_page, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); entry = maybe_mkwrite(pte_mkdirty(entry), vma); + /* * Clear the pte entry and flush it first, before updating the - * pte with the new entry. This will avoid a race condition - * seen in the presence of one thread doing SMC and another - * thread doing COW. + * pte with the new entry, to keep TLBs on different CPUs in + * sync. This code used to set the new PTE then flush TLBs, but + * that left a window where the new PTE could be loaded into + * some TLBs while the old PTE remains in others. */ ptep_clear_flush_notify(vma, vmf->address, vmf->pte); page_add_new_anon_rmap(new_page, vma, vmf->address, false); From 13384f6125ad7ebdcc8914fe1e03ded48ce76581 Mon Sep 17 00:00:00 2001 From: Walter Wu Date: Tue, 29 Dec 2020 15:14:46 -0800 Subject: [PATCH 244/547] kasan: fix null pointer dereference in kasan_record_aux_stack Syzbot reported the following [1]: BUG: kernel NULL pointer dereference, address: 0000000000000008 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 2d993067 P4D 2d993067 PUD 19a3c067 PMD 0 Oops: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 3852 Comm: kworker/1:2 Not tainted 5.10.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: events free_ipc RIP: 0010:kasan_record_aux_stack+0x77/0xb0 Add null checking slab object from kasan_get_alloc_meta() in order to avoid null pointer dereference. [1] https://syzkaller.appspot.com/x/log.txt?x=10a82a50d00000 Link: https://lkml.kernel.org/r/20201228080018.23041-1-walter-zh.wu@mediatek.com Signed-off-by: Walter Wu Suggested-by: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Andrey Konovalov Cc: Alexander Potapenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/generic.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 1dd5a0f99372..5106b84b07d4 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -337,6 +337,8 @@ void kasan_record_aux_stack(void *addr) cache = page->slab_cache; object = nearest_obj(cache, page, addr); alloc_meta = kasan_get_alloc_meta(cache, object); + if (!alloc_meta) + return; alloc_meta->aux_stack[1] = alloc_meta->aux_stack[0]; alloc_meta->aux_stack[0] = kasan_save_stack(GFP_NOWAIT); From 87dbc209ea04645fd2351981f09eff5d23f8e2e9 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 29 Dec 2020 15:14:49 -0800 Subject: [PATCH 245/547] local64.h: make mandatory Make mandatory in include/asm-generic/Kbuild and remove all arch/*/include/asm/local64.h arch-specific files since they only #include . This fixes build errors on arch/c6x/ and arch/nios2/ for block/blk-iocost.c. Build-tested on 21 of 25 arch-es. (tools problems on the others) Yes, we could even rename to and change all #includes to use instead. Link: https://lkml.kernel.org/r/20201227024446.17018-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Suggested-by: Christoph Hellwig Reviewed-by: Masahiro Yamada Cc: Jens Axboe Cc: Ley Foon Tan Cc: Mark Salter Cc: Aurelien Jacquiot Cc: Peter Zijlstra Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/local64.h | 1 - arch/arc/include/asm/Kbuild | 1 - arch/arm/include/asm/Kbuild | 1 - arch/arm64/include/asm/Kbuild | 1 - arch/csky/include/asm/Kbuild | 1 - arch/h8300/include/asm/Kbuild | 1 - arch/hexagon/include/asm/Kbuild | 1 - arch/ia64/include/asm/local64.h | 1 - arch/m68k/include/asm/Kbuild | 1 - arch/microblaze/include/asm/Kbuild | 1 - arch/mips/include/asm/Kbuild | 1 - arch/nds32/include/asm/Kbuild | 1 - arch/openrisc/include/asm/Kbuild | 1 - arch/parisc/include/asm/Kbuild | 1 - arch/powerpc/include/asm/Kbuild | 1 - arch/riscv/include/asm/Kbuild | 1 - arch/s390/include/asm/Kbuild | 1 - arch/sh/include/asm/Kbuild | 1 - arch/sparc/include/asm/Kbuild | 1 - arch/x86/include/asm/local64.h | 1 - arch/xtensa/include/asm/Kbuild | 1 - include/asm-generic/Kbuild | 1 + 22 files changed, 1 insertion(+), 21 deletions(-) delete mode 100644 arch/alpha/include/asm/local64.h delete mode 100644 arch/ia64/include/asm/local64.h delete mode 100644 arch/x86/include/asm/local64.h diff --git a/arch/alpha/include/asm/local64.h b/arch/alpha/include/asm/local64.h deleted file mode 100644 index 36c93b5cc239..000000000000 --- a/arch/alpha/include/asm/local64.h +++ /dev/null @@ -1 +0,0 @@ -#include diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild index 81f4edec0c2a..3c1afa524b9c 100644 --- a/arch/arc/include/asm/Kbuild +++ b/arch/arc/include/asm/Kbuild @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 generic-y += extable.h generic-y += kvm_para.h -generic-y += local64.h generic-y += mcs_spinlock.h generic-y += parport.h generic-y += user.h diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild index 4a0848aef207..03657ff8fbe3 100644 --- a/arch/arm/include/asm/Kbuild +++ b/arch/arm/include/asm/Kbuild @@ -2,7 +2,6 @@ generic-y += early_ioremap.h generic-y += extable.h generic-y += flat.h -generic-y += local64.h generic-y += parport.h generated-y += mach-types.h diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index ff9cbb631212..07ac208edc89 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild @@ -1,6 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 generic-y += early_ioremap.h -generic-y += local64.h generic-y += mcs_spinlock.h generic-y += qrwlock.h generic-y += qspinlock.h diff --git a/arch/csky/include/asm/Kbuild b/arch/csky/include/asm/Kbuild index 93372255984d..cc24bb8e539f 100644 --- a/arch/csky/include/asm/Kbuild +++ b/arch/csky/include/asm/Kbuild @@ -2,7 +2,6 @@ generic-y += asm-offsets.h generic-y += gpio.h generic-y += kvm_para.h -generic-y += local64.h generic-y += qrwlock.h generic-y += user.h generic-y += vmlinux.lds.h diff --git a/arch/h8300/include/asm/Kbuild b/arch/h8300/include/asm/Kbuild index ddf04f32b546..60ee7f0d60a8 100644 --- a/arch/h8300/include/asm/Kbuild +++ b/arch/h8300/include/asm/Kbuild @@ -2,7 +2,6 @@ generic-y += asm-offsets.h generic-y += extable.h generic-y += kvm_para.h -generic-y += local64.h generic-y += mcs_spinlock.h generic-y += parport.h generic-y += spinlock.h diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild index 373964bb177e..3ece3c93fe08 100644 --- a/arch/hexagon/include/asm/Kbuild +++ b/arch/hexagon/include/asm/Kbuild @@ -2,5 +2,4 @@ generic-y += extable.h generic-y += iomap.h generic-y += kvm_para.h -generic-y += local64.h generic-y += mcs_spinlock.h diff --git a/arch/ia64/include/asm/local64.h b/arch/ia64/include/asm/local64.h deleted file mode 100644 index 36c93b5cc239..000000000000 --- a/arch/ia64/include/asm/local64.h +++ /dev/null @@ -1 +0,0 @@ -#include diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild index 1bff55aa2d54..0dbf9c5c6fae 100644 --- a/arch/m68k/include/asm/Kbuild +++ b/arch/m68k/include/asm/Kbuild @@ -2,6 +2,5 @@ generated-y += syscall_table.h generic-y += extable.h generic-y += kvm_para.h -generic-y += local64.h generic-y += mcs_spinlock.h generic-y += spinlock.h diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild index 63bce836b9f1..29b0e557aa7c 100644 --- a/arch/microblaze/include/asm/Kbuild +++ b/arch/microblaze/include/asm/Kbuild @@ -2,7 +2,6 @@ generated-y += syscall_table.h generic-y += extable.h generic-y += kvm_para.h -generic-y += local64.h generic-y += mcs_spinlock.h generic-y += parport.h generic-y += syscalls.h diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild index 198b3bafdac9..95b4fa7bd0d1 100644 --- a/arch/mips/include/asm/Kbuild +++ b/arch/mips/include/asm/Kbuild @@ -6,7 +6,6 @@ generated-y += syscall_table_64_n64.h generated-y += syscall_table_64_o32.h generic-y += export.h generic-y += kvm_para.h -generic-y += local64.h generic-y += mcs_spinlock.h generic-y += parport.h generic-y += qrwlock.h diff --git a/arch/nds32/include/asm/Kbuild b/arch/nds32/include/asm/Kbuild index ff1e94299317..82a4453c9c2d 100644 --- a/arch/nds32/include/asm/Kbuild +++ b/arch/nds32/include/asm/Kbuild @@ -4,6 +4,5 @@ generic-y += cmpxchg.h generic-y += export.h generic-y += gpio.h generic-y += kvm_para.h -generic-y += local64.h generic-y += parport.h generic-y += user.h diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild index 442f3d3bcd90..ca5987e11053 100644 --- a/arch/openrisc/include/asm/Kbuild +++ b/arch/openrisc/include/asm/Kbuild @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 generic-y += extable.h generic-y += kvm_para.h -generic-y += local64.h generic-y += mcs_spinlock.h generic-y += qspinlock_types.h generic-y += qspinlock.h diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild index f16c4db80116..4406475a2304 100644 --- a/arch/parisc/include/asm/Kbuild +++ b/arch/parisc/include/asm/Kbuild @@ -3,6 +3,5 @@ generated-y += syscall_table_32.h generated-y += syscall_table_64.h generated-y += syscall_table_c32.h generic-y += kvm_para.h -generic-y += local64.h generic-y += mcs_spinlock.h generic-y += user.h diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild index 90cd5c53af66..e1f9b4ea1c53 100644 --- a/arch/powerpc/include/asm/Kbuild +++ b/arch/powerpc/include/asm/Kbuild @@ -5,7 +5,6 @@ generated-y += syscall_table_c32.h generated-y += syscall_table_spu.h generic-y += export.h generic-y += kvm_types.h -generic-y += local64.h generic-y += mcs_spinlock.h generic-y += qrwlock.h generic-y += vtime.h diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild index 59dd7be55005..445ccc97305a 100644 --- a/arch/riscv/include/asm/Kbuild +++ b/arch/riscv/include/asm/Kbuild @@ -3,6 +3,5 @@ generic-y += early_ioremap.h generic-y += extable.h generic-y += flat.h generic-y += kvm_para.h -generic-y += local64.h generic-y += user.h generic-y += vmlinux.lds.h diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index 319efa0e6d02..1a18d7b82f86 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -7,5 +7,4 @@ generated-y += unistd_nr.h generic-y += asm-offsets.h generic-y += export.h generic-y += kvm_types.h -generic-y += local64.h generic-y += mcs_spinlock.h diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild index 7435182ef846..fc44d9c88b41 100644 --- a/arch/sh/include/asm/Kbuild +++ b/arch/sh/include/asm/Kbuild @@ -1,6 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 generated-y += syscall_table.h generic-y += kvm_para.h -generic-y += local64.h generic-y += mcs_spinlock.h generic-y += parport.h diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild index 5269a704801f..3688fdae50e4 100644 --- a/arch/sparc/include/asm/Kbuild +++ b/arch/sparc/include/asm/Kbuild @@ -6,5 +6,4 @@ generated-y += syscall_table_64.h generated-y += syscall_table_c32.h generic-y += export.h generic-y += kvm_para.h -generic-y += local64.h generic-y += mcs_spinlock.h diff --git a/arch/x86/include/asm/local64.h b/arch/x86/include/asm/local64.h deleted file mode 100644 index 36c93b5cc239..000000000000 --- a/arch/x86/include/asm/local64.h +++ /dev/null @@ -1 +0,0 @@ -#include diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild index 9718e9593564..854c5e07e867 100644 --- a/arch/xtensa/include/asm/Kbuild +++ b/arch/xtensa/include/asm/Kbuild @@ -2,7 +2,6 @@ generated-y += syscall_table.h generic-y += extable.h generic-y += kvm_para.h -generic-y += local64.h generic-y += mcs_spinlock.h generic-y += param.h generic-y += qrwlock.h diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild index 4365b9aa3e3f..267f6dfb8960 100644 --- a/include/asm-generic/Kbuild +++ b/include/asm-generic/Kbuild @@ -34,6 +34,7 @@ mandatory-y += kmap_size.h mandatory-y += kprobes.h mandatory-y += linkage.h mandatory-y += local.h +mandatory-y += local64.h mandatory-y += mm-arch-hooks.h mandatory-y += mmiowb.h mandatory-y += mmu.h From 8b0fac44bd1ff17016502b3c3533f5abb8456c65 Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Tue, 29 Dec 2020 15:14:52 -0800 Subject: [PATCH 246/547] sizes.h: add SZ_8G/SZ_16G/SZ_32G macros Add these macros, since we can use them in drivers. Link: https://lkml.kernel.org/r/20201229072819.11183-1-sjhuang@iluvatar.ai Signed-off-by: Huang Shijie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sizes.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/sizes.h b/include/linux/sizes.h index 9874f6f67537..1ac79bcee2bb 100644 --- a/include/linux/sizes.h +++ b/include/linux/sizes.h @@ -44,6 +44,9 @@ #define SZ_2G 0x80000000 #define SZ_4G _AC(0x100000000, ULL) +#define SZ_8G _AC(0x200000000, ULL) +#define SZ_16G _AC(0x400000000, ULL) +#define SZ_32G _AC(0x800000000, ULL) #define SZ_64T _AC(0x400000000000, ULL) #endif /* __LINUX_SIZES_H__ */ From aa8c7db494d0a83ecae583aa193f1134ef25d506 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 29 Dec 2020 15:14:55 -0800 Subject: [PATCH 247/547] kdev_t: always inline major/minor helper functions Silly GCC doesn't always inline these trivial functions. Fixes the following warning: arch/x86/kernel/sys_ia32.o: warning: objtool: cp_stat64()+0xd8: call to new_encode_dev() with UACCESS enabled Link: https://lkml.kernel.org/r/984353b44a4484d86ba9f73884b7306232e25e30.1608737428.git.jpoimboe@redhat.com Signed-off-by: Josh Poimboeuf Reported-by: Randy Dunlap Acked-by: Randy Dunlap [build-tested] Cc: Peter Zijlstra Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kdev_t.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/include/linux/kdev_t.h b/include/linux/kdev_t.h index 85b5151911cf..4856706fbfeb 100644 --- a/include/linux/kdev_t.h +++ b/include/linux/kdev_t.h @@ -21,61 +21,61 @@ }) /* acceptable for old filesystems */ -static inline bool old_valid_dev(dev_t dev) +static __always_inline bool old_valid_dev(dev_t dev) { return MAJOR(dev) < 256 && MINOR(dev) < 256; } -static inline u16 old_encode_dev(dev_t dev) +static __always_inline u16 old_encode_dev(dev_t dev) { return (MAJOR(dev) << 8) | MINOR(dev); } -static inline dev_t old_decode_dev(u16 val) +static __always_inline dev_t old_decode_dev(u16 val) { return MKDEV((val >> 8) & 255, val & 255); } -static inline u32 new_encode_dev(dev_t dev) +static __always_inline u32 new_encode_dev(dev_t dev) { unsigned major = MAJOR(dev); unsigned minor = MINOR(dev); return (minor & 0xff) | (major << 8) | ((minor & ~0xff) << 12); } -static inline dev_t new_decode_dev(u32 dev) +static __always_inline dev_t new_decode_dev(u32 dev) { unsigned major = (dev & 0xfff00) >> 8; unsigned minor = (dev & 0xff) | ((dev >> 12) & 0xfff00); return MKDEV(major, minor); } -static inline u64 huge_encode_dev(dev_t dev) +static __always_inline u64 huge_encode_dev(dev_t dev) { return new_encode_dev(dev); } -static inline dev_t huge_decode_dev(u64 dev) +static __always_inline dev_t huge_decode_dev(u64 dev) { return new_decode_dev(dev); } -static inline int sysv_valid_dev(dev_t dev) +static __always_inline int sysv_valid_dev(dev_t dev) { return MAJOR(dev) < (1<<14) && MINOR(dev) < (1<<18); } -static inline u32 sysv_encode_dev(dev_t dev) +static __always_inline u32 sysv_encode_dev(dev_t dev) { return MINOR(dev) | (MAJOR(dev) << 18); } -static inline unsigned sysv_major(u32 dev) +static __always_inline unsigned sysv_major(u32 dev) { return (dev >> 18) & 0x3fff; } -static inline unsigned sysv_minor(u32 dev) +static __always_inline unsigned sysv_minor(u32 dev) { return dev & 0x3ffff; } From 36845663843fc59c5d794e3dc0641472e3e572da Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Tue, 29 Dec 2020 15:14:58 -0800 Subject: [PATCH 248/547] lib/genalloc: fix the overflow when size is too big Some graphic card has very big memory on chip, such as 32G bytes. In the following case, it will cause overflow: pool = gen_pool_create(PAGE_SHIFT, NUMA_NO_NODE); ret = gen_pool_add(pool, 0x1000000, SZ_32G, NUMA_NO_NODE); va = gen_pool_alloc(pool, SZ_4G); The overflow occurs in gen_pool_alloc_algo_owner(): .... size = nbits << order; .... The @nbits is "int" type, so it will overflow. Then the gen_pool_avail() will return the wrong value. This patch converts some "int" to "unsigned long", and changes the compare code in while. Link: https://lkml.kernel.org/r/20201229060657.3389-1-sjhuang@iluvatar.ai Signed-off-by: Huang Shijie Reported-by: Shi Jiasheng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/genalloc.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/lib/genalloc.c b/lib/genalloc.c index 7f1244b5294a..dab97bb69df6 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -81,14 +81,14 @@ static int clear_bits_ll(unsigned long *addr, unsigned long mask_to_clear) * users set the same bit, one user will return remain bits, otherwise * return 0. */ -static int bitmap_set_ll(unsigned long *map, int start, int nr) +static int bitmap_set_ll(unsigned long *map, unsigned long start, unsigned long nr) { unsigned long *p = map + BIT_WORD(start); - const int size = start + nr; + const unsigned long size = start + nr; int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG); unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start); - while (nr - bits_to_set >= 0) { + while (nr >= bits_to_set) { if (set_bits_ll(p, mask_to_set)) return nr; nr -= bits_to_set; @@ -116,14 +116,15 @@ static int bitmap_set_ll(unsigned long *map, int start, int nr) * users clear the same bit, one user will return remain bits, * otherwise return 0. */ -static int bitmap_clear_ll(unsigned long *map, int start, int nr) +static unsigned long +bitmap_clear_ll(unsigned long *map, unsigned long start, unsigned long nr) { unsigned long *p = map + BIT_WORD(start); - const int size = start + nr; + const unsigned long size = start + nr; int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start); - while (nr - bits_to_clear >= 0) { + while (nr >= bits_to_clear) { if (clear_bits_ll(p, mask_to_clear)) return nr; nr -= bits_to_clear; @@ -183,8 +184,8 @@ int gen_pool_add_owner(struct gen_pool *pool, unsigned long virt, phys_addr_t ph size_t size, int nid, void *owner) { struct gen_pool_chunk *chunk; - int nbits = size >> pool->min_alloc_order; - int nbytes = sizeof(struct gen_pool_chunk) + + unsigned long nbits = size >> pool->min_alloc_order; + unsigned long nbytes = sizeof(struct gen_pool_chunk) + BITS_TO_LONGS(nbits) * sizeof(long); chunk = vzalloc_node(nbytes, nid); @@ -242,7 +243,7 @@ void gen_pool_destroy(struct gen_pool *pool) struct list_head *_chunk, *_next_chunk; struct gen_pool_chunk *chunk; int order = pool->min_alloc_order; - int bit, end_bit; + unsigned long bit, end_bit; list_for_each_safe(_chunk, _next_chunk, &pool->chunks) { chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk); @@ -278,7 +279,7 @@ unsigned long gen_pool_alloc_algo_owner(struct gen_pool *pool, size_t size, struct gen_pool_chunk *chunk; unsigned long addr = 0; int order = pool->min_alloc_order; - int nbits, start_bit, end_bit, remain; + unsigned long nbits, start_bit, end_bit, remain; #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG BUG_ON(in_nmi()); @@ -487,7 +488,7 @@ void gen_pool_free_owner(struct gen_pool *pool, unsigned long addr, size_t size, { struct gen_pool_chunk *chunk; int order = pool->min_alloc_order; - int start_bit, nbits, remain; + unsigned long start_bit, nbits, remain; #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG BUG_ON(in_nmi()); @@ -755,7 +756,7 @@ unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size, index = bitmap_find_next_zero_area(map, size, start, nr, 0); while (index < size) { - int next_bit = find_next_bit(map, size, index + nr); + unsigned long next_bit = find_next_bit(map, size, index + nr); if ((next_bit - index) < len) { len = next_bit - index; start_bit = index; From f0bb29e8c4076444d32df00c8d32e169ceecf283 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Tue, 29 Dec 2020 15:15:01 -0800 Subject: [PATCH 249/547] lib/zlib: fix inflating zlib streams on s390 Decompressing zlib streams on s390 fails with "incorrect data check" error. Userspace zlib checks inflate_state.flags in order to byteswap checksums only for zlib streams, and s390 hardware inflate code, which was ported from there, tries to match this behavior. At the same time, kernel zlib does not use inflate_state.flags, so it contains essentially random values. For many use cases either zlib stream is zeroed out or checksum is not used, so this problem is masked, but at least SquashFS is still affected. Fix by always passing a checksum to and from the hardware as is, which matches zlib_inflate()'s expectations. Link: https://lkml.kernel.org/r/20201215155551.894884-1-iii@linux.ibm.com Fixes: 126196100063 ("lib/zlib: add s390 hardware support for kernel zlib_inflate") Signed-off-by: Ilya Leoshkevich Tested-by: Christian Borntraeger Acked-by: Mikhail Zaslonko Acked-by: Christian Borntraeger Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Mikhail Zaslonko Cc: [5.6+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/zlib_dfltcc/dfltcc_inflate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/zlib_dfltcc/dfltcc_inflate.c b/lib/zlib_dfltcc/dfltcc_inflate.c index db107016d29b..fb60b5a6a1cb 100644 --- a/lib/zlib_dfltcc/dfltcc_inflate.c +++ b/lib/zlib_dfltcc/dfltcc_inflate.c @@ -125,7 +125,7 @@ dfltcc_inflate_action dfltcc_inflate( param->ho = (state->write - state->whave) & ((1 << HB_BITS) - 1); if (param->hl) param->nt = 0; /* Honor history for the first block */ - param->cv = state->flags ? REVERSE(state->check) : state->check; + param->cv = state->check; /* Inflate */ do { @@ -138,7 +138,7 @@ dfltcc_inflate_action dfltcc_inflate( state->bits = param->sbb; state->whave = param->hl; state->write = (param->ho + param->hl) & ((1 << HB_BITS) - 1); - state->check = state->flags ? REVERSE(param->cv) : param->cv; + state->check = param->cv; if (cc == DFLTCC_CC_OP2_CORRUPT && param->oesc != 0) { /* Report an error if stream is corrupted */ state->mode = BAD; From 605cc30dea249edf1b659e7d0146a2cf13cbbf71 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 29 Dec 2020 15:15:04 -0800 Subject: [PATCH 250/547] zlib: move EXPORT_SYMBOL() and MODULE_LICENSE() out of dfltcc_syms.c In commit 11fb479ff5d9 ("zlib: export S390 symbols for zlib modules"), I added EXPORT_SYMBOL()s to dfltcc_inflate.c but then Mikhail said that these should probably be in dfltcc_syms.c with the other EXPORT_SYMBOL()s. However, that is contrary to the current kernel style, which places EXPORT_SYMBOL() immediately after the function that it applies to, so move all EXPORT_SYMBOL()s to their respective function locations and drop the dfltcc_syms.c file. Also move MODULE_LICENSE() from the deleted file to dfltcc.c. [rdunlap@infradead.org: remove dfltcc_syms.o from Makefile] Link: https://lkml.kernel.org/r/20201227171837.15492-1-rdunlap@infradead.org Link: https://lkml.kernel.org/r/20201219052530.28461-1-rdunlap@infradead.org Fixes: 11fb479ff5d9 ("zlib: export S390 symbols for zlib modules") Signed-off-by: Randy Dunlap Cc: Acked-by: Ilya Leoshkevich Acked-by: Christian Borntraeger Cc: Zaslonko Mikhail Cc: Heiko Carstens Cc: Vasily Gorbik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/zlib_dfltcc/Makefile | 2 +- lib/zlib_dfltcc/dfltcc.c | 6 +++++- lib/zlib_dfltcc/dfltcc_deflate.c | 3 +++ lib/zlib_dfltcc/dfltcc_syms.c | 17 ----------------- 4 files changed, 9 insertions(+), 19 deletions(-) delete mode 100644 lib/zlib_dfltcc/dfltcc_syms.c diff --git a/lib/zlib_dfltcc/Makefile b/lib/zlib_dfltcc/Makefile index 8e4d5afbbb10..66e1c96387c4 100644 --- a/lib/zlib_dfltcc/Makefile +++ b/lib/zlib_dfltcc/Makefile @@ -8,4 +8,4 @@ obj-$(CONFIG_ZLIB_DFLTCC) += zlib_dfltcc.o -zlib_dfltcc-objs := dfltcc.o dfltcc_deflate.o dfltcc_inflate.o dfltcc_syms.o +zlib_dfltcc-objs := dfltcc.o dfltcc_deflate.o dfltcc_inflate.o diff --git a/lib/zlib_dfltcc/dfltcc.c b/lib/zlib_dfltcc/dfltcc.c index c30de430b30c..782f76e9d4da 100644 --- a/lib/zlib_dfltcc/dfltcc.c +++ b/lib/zlib_dfltcc/dfltcc.c @@ -1,7 +1,8 @@ // SPDX-License-Identifier: Zlib /* dfltcc.c - SystemZ DEFLATE CONVERSION CALL support. */ -#include +#include +#include #include "dfltcc_util.h" #include "dfltcc.h" @@ -53,3 +54,6 @@ void dfltcc_reset( dfltcc_state->dht_threshold = DFLTCC_DHT_MIN_SAMPLE_SIZE; dfltcc_state->param.ribm = DFLTCC_RIBM; } +EXPORT_SYMBOL(dfltcc_reset); + +MODULE_LICENSE("GPL"); diff --git a/lib/zlib_dfltcc/dfltcc_deflate.c b/lib/zlib_dfltcc/dfltcc_deflate.c index 00c185101c6d..6c946e8532ee 100644 --- a/lib/zlib_dfltcc/dfltcc_deflate.c +++ b/lib/zlib_dfltcc/dfltcc_deflate.c @@ -4,6 +4,7 @@ #include "dfltcc_util.h" #include "dfltcc.h" #include +#include #include /* @@ -34,6 +35,7 @@ int dfltcc_can_deflate( return 1; } +EXPORT_SYMBOL(dfltcc_can_deflate); static void dfltcc_gdht( z_streamp strm @@ -277,3 +279,4 @@ again: goto again; /* deflate() must use all input or all output */ return 1; } +EXPORT_SYMBOL(dfltcc_deflate); diff --git a/lib/zlib_dfltcc/dfltcc_syms.c b/lib/zlib_dfltcc/dfltcc_syms.c deleted file mode 100644 index 6f23481804c1..000000000000 --- a/lib/zlib_dfltcc/dfltcc_syms.c +++ /dev/null @@ -1,17 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/lib/zlib_dfltcc/dfltcc_syms.c - * - * Exported symbols for the s390 zlib dfltcc support. - * - */ - -#include -#include -#include -#include "dfltcc.h" - -EXPORT_SYMBOL(dfltcc_can_deflate); -EXPORT_SYMBOL(dfltcc_deflate); -EXPORT_SYMBOL(dfltcc_reset); -MODULE_LICENSE("GPL"); From 1f3147b49d75b47b6be54a1e6dfa87a4921e1e51 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 29 Dec 2020 15:15:07 -0800 Subject: [PATCH 251/547] mm: slub: call account_slab_page() after slab page initialization It's convenient to have page->objects initialized before calling into account_slab_page(). In particular, this information can be used to pre-alloc the obj_cgroup vector. Let's call account_slab_page() a bit later, after the initialization of page->objects. This commit doesn't bring any functional change, but is required for further optimizations. [akpm@linux-foundation.org: undo changes needed by forthcoming mm-memcg-slab-pre-allocate-obj_cgroups-for-slab-caches-with-slab_account.patch] Link: https://lkml.kernel.org/r/20201110195753.530157-1-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Cc: Michal Hocko Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 0c8b43a5b3b0..dc5b42e700b8 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1619,9 +1619,6 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s, else page = __alloc_pages_node(node, flags, order); - if (page) - account_slab_page(page, order, s); - return page; } @@ -1774,6 +1771,8 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) page->objects = oo_objects(oo); + account_slab_page(page, oo_order(oo), s); + page->slab_cache = s; __SetPageSlab(page); if (page_is_pfmemalloc(page)) From 875b2376fd663832bf45f7285c9d26cb8c52929a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 28 Dec 2020 19:47:06 -0800 Subject: [PATCH 252/547] fs: block_dev.c: fix kernel-doc warnings from struct block_device changes Fix new kernel-doc warnings in fs/block_dev.c: ../fs/block_dev.c:1066: warning: Excess function parameter 'whole' description in 'bd_abort_claiming' ../fs/block_dev.c:1837: warning: Function parameter or member 'dev' not described in 'lookup_bdev' Fixes: 4e7b5671c6a8 ("block: remove i_bdev") Fixes: 37c3fc9abb25 ("block: simplify the block device claiming interface") Signed-off-by: Randy Dunlap Cc: Jens Axboe Cc: Christoph Hellwig Cc: linux-fsdevel@vger.kernel.org Cc: Alexander Viro Signed-off-by: Jens Axboe --- fs/block_dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 9293045e128c..3e5b02f6606c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1055,7 +1055,6 @@ static void bd_finish_claiming(struct block_device *bdev, void *holder) /** * bd_abort_claiming - abort claiming of a block device * @bdev: block device of interest - * @whole: whole block device * @holder: holder that has claimed @bdev * * Abort claiming of a block device when the exclusive open failed. This can be @@ -1828,6 +1827,7 @@ const struct file_operations def_blk_fops = { /** * lookup_bdev - lookup a struct block_device by name * @pathname: special file representing the block device + * @dev: return value of the block device's dev_t * * Get a reference to the blockdevice at @pathname in the current * namespace if possible and return it. Return ERR_PTR(error) From dc30432605bbbd486dfede3852ea4d42c40a84b4 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Mon, 28 Dec 2020 11:27:18 -0800 Subject: [PATCH 253/547] block: add debugfs stanza for QUEUE_FLAG_NOWAIT This was missed in 021a24460dc2. Leads to the numeric value of QUEUE_FLAG_NOWAIT (i.e. 29) showing up in /sys/kernel/debug/block/*/state. Fixes: 021a24460dc28e7412aecfae89f60e1847e685c0 Cc: Konstantin Khlebnikov Cc: Mike Snitzer Cc: Christoph Hellwig Cc: Jens Axboe Signed-off-by: Andres Freund Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 3094542e12ae..e21eed20a155 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -129,6 +129,7 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(PCI_P2PDMA), QUEUE_FLAG_NAME(ZONE_RESETALL), QUEUE_FLAG_NAME(RQ_ALLOC_TIME), + QUEUE_FLAG_NAME(NOWAIT), }; #undef QUEUE_FLAG_NAME From 44362a3c353aeec5904c2ae6d1737f20fe7e9c79 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 23 Dec 2020 12:08:54 +0000 Subject: [PATCH 254/547] KVM: arm64: Fix hyp_cpu_pm_{init,exit} __init annotation The __init annotations on hyp_cpu_pm_{init,exit} are obviously incorrect, and the build system shouts at you if you enable DEBUG_SECTION_MISMATCH. Nothing really bad happens as we never execute that code outside of the init context, but we can't label the callers as __int either, as kvm_init isn't __init itself. Oh well. Signed-off-by: Marc Zyngier Reviewed-by: Nathan Chancellor Link: https://lore.kernel.org/r/20201223120854.255347-1-maz@kernel.org --- arch/arm64/kvm/arm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index ab782c480e9a..04c44853b103 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1568,12 +1568,12 @@ static struct notifier_block hyp_init_cpu_pm_nb = { .notifier_call = hyp_init_cpu_pm_notifier, }; -static void __init hyp_cpu_pm_init(void) +static void hyp_cpu_pm_init(void) { if (!is_protected_kvm_enabled()) cpu_pm_register_notifier(&hyp_init_cpu_pm_nb); } -static void __init hyp_cpu_pm_exit(void) +static void hyp_cpu_pm_exit(void) { if (!is_protected_kvm_enabled()) cpu_pm_unregister_notifier(&hyp_init_cpu_pm_nb); From 6820e812dafb4258bc14692f686eec5bde6fba86 Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Wed, 16 Dec 2020 11:23:21 +0200 Subject: [PATCH 255/547] spi: Fix the clamping of spi->max_speed_hz If spi->controller->max_speed_hz is zero, a non-zero spi->max_speed_hz will be overwritten by zero. Make sure spi->controller->max_speed_hz is not zero when clamping spi->max_speed_hz. Put the spi->controller->max_speed_hz non-zero check higher in the if, so that we avoid a superfluous init to zero when both spi->max_speed_hz and spi->controller->max_speed_hz are zero. Fixes: 9326e4f1e5dd ("spi: Limit the spi device max speed to controller's max speed") Reported-by: Geert Uytterhoeven Suggested-by: Geert Uytterhoeven Signed-off-by: Tudor Ambarus Tested-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20201216092321.413262-1-tudor.ambarus@microchip.com Signed-off-by: Mark Brown --- drivers/spi/spi.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 51d7c004fbab..f59bf5094adb 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -3378,8 +3378,9 @@ int spi_setup(struct spi_device *spi) if (status) return status; - if (!spi->max_speed_hz || - spi->max_speed_hz > spi->controller->max_speed_hz) + if (spi->controller->max_speed_hz && + (!spi->max_speed_hz || + spi->max_speed_hz > spi->controller->max_speed_hz)) spi->max_speed_hz = spi->controller->max_speed_hz; mutex_lock(&spi->controller->io_mutex); From e042f151ec7474b88b8c1edaaddd1ff7415d7117 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 28 Dec 2020 19:54:28 -0800 Subject: [PATCH 256/547] hwmon: (sbtsi_temp) Fix Documenation kernel-doc warning Fix Documentation/hwmon/ kernel-doc warning in 5.11-rc1: lnx-511-rc1/Documentation/hwmon/sbtsi_temp.rst:4: WARNING: Title underline too short. Kernel driver sbtsi_temp ================== Fixes: 6ec3fcf556fe ("hwmon: (sbtsi) Add documentation") Signed-off-by: Randy Dunlap Cc: Kun Yi Cc: Guenter Roeck Cc: Jean Delvare Cc: linux-hwmon@vger.kernel.org Link: https://lore.kernel.org/r/20201229035428.31270-1-rdunlap@infradead.org Signed-off-by: Guenter Roeck --- Documentation/hwmon/sbtsi_temp.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/hwmon/sbtsi_temp.rst b/Documentation/hwmon/sbtsi_temp.rst index 922b3c8db666..749f518389c3 100644 --- a/Documentation/hwmon/sbtsi_temp.rst +++ b/Documentation/hwmon/sbtsi_temp.rst @@ -1,7 +1,7 @@ .. SPDX-License-Identifier: GPL-2.0-or-later Kernel driver sbtsi_temp -================== +======================== Supported hardware: From 742eb4750ff35fd62784b04b675d672b8dee2524 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 18 Nov 2020 21:23:33 +0100 Subject: [PATCH 257/547] s390: update defconfigs Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/configs/debug_defconfig | 12 +++++++++--- arch/s390/configs/defconfig | 11 +++++++---- arch/s390/configs/zfcpdump_defconfig | 2 ++ 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 1be32fcf6f2e..c4f6ff98a612 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -61,7 +61,9 @@ CONFIG_OPROFILE=m CONFIG_KPROBES=y CONFIG_JUMP_LABEL=y CONFIG_STATIC_KEYS_SELFTEST=y +CONFIG_SECCOMP_CACHE_DEBUG=y CONFIG_LOCK_EVENT_COUNTS=y +# CONFIG_GCC_PLUGINS is not set CONFIG_MODULES=y CONFIG_MODULE_FORCE_LOAD=y CONFIG_MODULE_UNLOAD=y @@ -410,12 +412,12 @@ CONFIG_SCSI_ENCLOSURE=m CONFIG_SCSI_CONSTANTS=y CONFIG_SCSI_LOGGING=y CONFIG_SCSI_SPI_ATTRS=m -CONFIG_SCSI_FC_ATTRS=y +CONFIG_SCSI_FC_ATTRS=m CONFIG_SCSI_SAS_LIBSAS=m CONFIG_SCSI_SRP_ATTRS=m CONFIG_ISCSI_TCP=m CONFIG_SCSI_DEBUG=m -CONFIG_ZFCP=y +CONFIG_ZFCP=m CONFIG_SCSI_VIRTIO=m CONFIG_SCSI_DH=y CONFIG_SCSI_DH_RDAC=m @@ -444,6 +446,7 @@ CONFIG_DM_MULTIPATH=m CONFIG_DM_MULTIPATH_QL=m CONFIG_DM_MULTIPATH_ST=m CONFIG_DM_MULTIPATH_HST=m +CONFIG_DM_MULTIPATH_IOA=m CONFIG_DM_DELAY=m CONFIG_DM_UEVENT=y CONFIG_DM_FLAKEY=m @@ -542,7 +545,6 @@ CONFIG_INPUT_EVDEV=y # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set CONFIG_LEGACY_PTY_COUNT=0 -CONFIG_NULL_TTY=m CONFIG_VIRTIO_CONSOLE=y CONFIG_HW_RANDOM_VIRTIO=m CONFIG_RAW_DRIVER=m @@ -574,6 +576,7 @@ CONFIG_VIRTIO_BALLOON=m CONFIG_VIRTIO_INPUT=y CONFIG_VHOST_NET=m CONFIG_VHOST_VSOCK=m +# CONFIG_SURFACE_PLATFORMS is not set CONFIG_S390_CCW_IOMMU=y CONFIG_S390_AP_IOMMU=y CONFIG_EXT4_FS=y @@ -655,6 +658,7 @@ CONFIG_CIFS_XATTR=y CONFIG_CIFS_POSIX=y # CONFIG_CIFS_DEBUG is not set CONFIG_CIFS_DFS_UPCALL=y +CONFIG_CIFS_SWN_UPCALL=y CONFIG_NLS_DEFAULT="utf8" CONFIG_NLS_CODEPAGE_437=m CONFIG_NLS_CODEPAGE_850=m @@ -826,6 +830,8 @@ CONFIG_FTRACE_SYSCALLS=y CONFIG_BLK_DEV_IO_TRACE=y CONFIG_BPF_KPROBE_OVERRIDE=y CONFIG_HIST_TRIGGERS=y +CONFIG_FTRACE_STARTUP_TEST=y +# CONFIG_EVENT_TRACE_STARTUP_TEST is not set CONFIG_DEBUG_USER_ASCE=y CONFIG_NOTIFIER_ERROR_INJECTION=m CONFIG_NETDEV_NOTIFIER_ERROR_INJECT=m diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index e2171a008809..51135893cffe 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -58,6 +58,7 @@ CONFIG_S390_UNWIND_SELFTEST=m CONFIG_OPROFILE=m CONFIG_KPROBES=y CONFIG_JUMP_LABEL=y +# CONFIG_GCC_PLUGINS is not set CONFIG_MODULES=y CONFIG_MODULE_FORCE_LOAD=y CONFIG_MODULE_UNLOAD=y @@ -95,7 +96,6 @@ CONFIG_ZSMALLOC_STAT=y CONFIG_DEFERRED_STRUCT_PAGE_INIT=y CONFIG_IDLE_PAGE_TRACKING=y CONFIG_PERCPU_STATS=y -CONFIG_GUP_TEST=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_PACKET_DIAG=m @@ -403,12 +403,12 @@ CONFIG_SCSI_ENCLOSURE=m CONFIG_SCSI_CONSTANTS=y CONFIG_SCSI_LOGGING=y CONFIG_SCSI_SPI_ATTRS=m -CONFIG_SCSI_FC_ATTRS=y +CONFIG_SCSI_FC_ATTRS=m CONFIG_SCSI_SAS_LIBSAS=m CONFIG_SCSI_SRP_ATTRS=m CONFIG_ISCSI_TCP=m CONFIG_SCSI_DEBUG=m -CONFIG_ZFCP=y +CONFIG_ZFCP=m CONFIG_SCSI_VIRTIO=m CONFIG_SCSI_DH=y CONFIG_SCSI_DH_RDAC=m @@ -437,6 +437,7 @@ CONFIG_DM_MULTIPATH=m CONFIG_DM_MULTIPATH_QL=m CONFIG_DM_MULTIPATH_ST=m CONFIG_DM_MULTIPATH_HST=m +CONFIG_DM_MULTIPATH_IOA=m CONFIG_DM_DELAY=m CONFIG_DM_UEVENT=y CONFIG_DM_FLAKEY=m @@ -536,7 +537,6 @@ CONFIG_INPUT_EVDEV=y # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set CONFIG_LEGACY_PTY_COUNT=0 -CONFIG_NULL_TTY=m CONFIG_VIRTIO_CONSOLE=y CONFIG_HW_RANDOM_VIRTIO=m CONFIG_RAW_DRIVER=m @@ -568,6 +568,7 @@ CONFIG_VIRTIO_BALLOON=m CONFIG_VIRTIO_INPUT=y CONFIG_VHOST_NET=m CONFIG_VHOST_VSOCK=m +# CONFIG_SURFACE_PLATFORMS is not set CONFIG_S390_CCW_IOMMU=y CONFIG_S390_AP_IOMMU=y CONFIG_EXT4_FS=y @@ -645,6 +646,7 @@ CONFIG_CIFS_XATTR=y CONFIG_CIFS_POSIX=y # CONFIG_CIFS_DEBUG is not set CONFIG_CIFS_DFS_UPCALL=y +CONFIG_CIFS_SWN_UPCALL=y CONFIG_NLS_DEFAULT="utf8" CONFIG_NLS_CODEPAGE_437=m CONFIG_NLS_CODEPAGE_850=m @@ -778,6 +780,7 @@ CONFIG_FTRACE_SYSCALLS=y CONFIG_BLK_DEV_IO_TRACE=y CONFIG_BPF_KPROBE_OVERRIDE=y CONFIG_HIST_TRIGGERS=y +CONFIG_DEBUG_USER_ASCE=y CONFIG_LKDTM=m CONFIG_PERCPU_TEST=m CONFIG_ATOMIC64_SELFTEST=y diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig index a302630341ef..1ef211dae77a 100644 --- a/arch/s390/configs/zfcpdump_defconfig +++ b/arch/s390/configs/zfcpdump_defconfig @@ -22,6 +22,7 @@ CONFIG_CRASH_DUMP=y # CONFIG_VIRTUALIZATION is not set # CONFIG_S390_GUEST is not set # CONFIG_SECCOMP is not set +# CONFIG_GCC_PLUGINS is not set CONFIG_PARTITION_ADVANCED=y CONFIG_IBM_PARTITION=y # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set @@ -58,6 +59,7 @@ CONFIG_RAW_DRIVER=y # CONFIG_HID is not set # CONFIG_VIRTIO_MENU is not set # CONFIG_VHOST_MENU is not set +# CONFIG_SURFACE_PLATFORMS is not set # CONFIG_IOMMU_SUPPORT is not set # CONFIG_DNOTIFY is not set # CONFIG_INOTIFY_USER is not set From 129975e75b9a2ba528d7f58be2e338cd644f6ed8 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 28 Dec 2020 09:51:57 +0100 Subject: [PATCH 258/547] s390/Kconfig: sort config S390 select list once again ...and add comments at the top and bottom. Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/Kconfig | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index e84bdd15150b..c72874f09741 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -54,17 +54,23 @@ config KASAN_SHADOW_OFFSET config S390 def_bool y + # + # Note: keep this list sorted alphabetically + # + imply IMA_SECURE_AND_OR_TRUSTED_BOOT select ARCH_BINFMT_ELF_STATE select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DEBUG_WX select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE + select ARCH_HAS_FORCE_DMA_UNENCRYPTED select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_GIGANTIC_PAGE select ARCH_HAS_KCOV select ARCH_HAS_MEM_ENCRYPT select ARCH_HAS_PTE_SPECIAL + select ARCH_HAS_SCALED_CPUTIME select ARCH_HAS_SET_MEMORY select ARCH_HAS_STRICT_KERNEL_RWX select ARCH_HAS_STRICT_MODULE_RWX @@ -111,8 +117,10 @@ config S390 select ARCH_WANT_IPC_PARSE_VERSION select BUILDTIME_TABLE_SORT select CLONE_BACKWARDS2 + select CPU_NO_EFFICIENT_FFS if !HAVE_MARCH_Z9_109_FEATURES select DMA_OPS if PCI select DYNAMIC_FTRACE if FUNCTION_TRACER + select GENERIC_ALLOCATOR select GENERIC_CPU_AUTOPROBE select GENERIC_CPU_VULNERABILITIES select GENERIC_FIND_FIRST_BIT @@ -126,22 +134,21 @@ config S390 select HAVE_ARCH_JUMP_LABEL_RELATIVE select HAVE_ARCH_KASAN select HAVE_ARCH_KASAN_VMALLOC - select CPU_NO_EFFICIENT_FFS if !HAVE_MARCH_Z9_109_FEATURES select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_SOFT_DIRTY select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE select HAVE_ARCH_VMAP_STACK select HAVE_ASM_MODVERSIONS - select HAVE_EBPF_JIT if PACK_STACK && HAVE_MARCH_Z196_FEATURES select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_LOCAL select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS - select HAVE_FAST_GUP + select HAVE_EBPF_JIT if PACK_STACK && HAVE_MARCH_Z196_FEATURES select HAVE_EFFICIENT_UNALIGNED_ACCESS + select HAVE_FAST_GUP select HAVE_FENTRY select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_ERROR_INJECTION @@ -163,16 +170,15 @@ config S390 select HAVE_KRETPROBES select HAVE_KVM select HAVE_LIVEPATCH - select HAVE_PERF_REGS - select HAVE_PERF_USER_STACK_DUMP select HAVE_MEMBLOCK_PHYS_MAP - select MMU_GATHER_NO_GATHER select HAVE_MOD_ARCH_SPECIFIC + select HAVE_NMI select HAVE_NOP_MCOUNT select HAVE_OPROFILE select HAVE_PCI select HAVE_PERF_EVENTS - select MMU_GATHER_RCU_TABLE_FREE + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE select HAVE_RSEQ @@ -181,6 +187,8 @@ config S390 select HAVE_VIRT_CPU_ACCOUNTING_IDLE select IOMMU_HELPER if PCI select IOMMU_SUPPORT if PCI + select MMU_GATHER_NO_GATHER + select MMU_GATHER_RCU_TABLE_FREE select MODULES_USE_ELF_RELA select NEED_DMA_MAP_STATE if PCI select NEED_SG_DMA_LENGTH if PCI @@ -190,17 +198,12 @@ config S390 select PCI_MSI if PCI select PCI_MSI_ARCH_FALLBACKS if PCI_MSI select SPARSE_IRQ + select SWIOTLB select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK select TTY select VIRT_CPU_ACCOUNTING - select ARCH_HAS_SCALED_CPUTIME - select HAVE_NMI - select ARCH_HAS_FORCE_DMA_UNENCRYPTED - select SWIOTLB - select GENERIC_ALLOCATOR - imply IMA_SECURE_AND_OR_TRUSTED_BOOT - + # Note: keep the above list sorted alphabetically config SCHED_OMIT_FRAME_POINTER def_bool y From 1eda52334e6d13eb1a85f713ce06dd39342b5020 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 15 Dec 2020 10:20:30 +0100 Subject: [PATCH 259/547] hwmon: (pwm-fan) Ensure that calculation doesn't discard big period values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With MAX_PWM being defined to 255 the code unsigned long period; ... period = ctx->pwm->args.period; state.duty_cycle = DIV_ROUND_UP(pwm * (period - 1), MAX_PWM); calculates a too small value for duty_cycle if the configured period is big (either by discarding the 64 bit value ctx->pwm->args.period or by overflowing the multiplication). As this results in a too slow fan and so maybe an overheating machine better be safe than sorry and error out in .probe. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20201215092031.152243-1-u.kleine-koenig@pengutronix.de Signed-off-by: Guenter Roeck --- drivers/hwmon/pwm-fan.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/hwmon/pwm-fan.c b/drivers/hwmon/pwm-fan.c index 777439f48c14..111a91dc6b79 100644 --- a/drivers/hwmon/pwm-fan.c +++ b/drivers/hwmon/pwm-fan.c @@ -334,8 +334,18 @@ static int pwm_fan_probe(struct platform_device *pdev) ctx->pwm_value = MAX_PWM; - /* Set duty cycle to maximum allowed and enable PWM output */ pwm_init_state(ctx->pwm, &state); + /* + * __set_pwm assumes that MAX_PWM * (period - 1) fits into an unsigned + * long. Check this here to prevent the fan running at a too low + * frequency. + */ + if (state.period > ULONG_MAX / MAX_PWM + 1) { + dev_err(dev, "Configured period too big\n"); + return -EINVAL; + } + + /* Set duty cycle to maximum allowed and enable PWM output */ state.duty_cycle = ctx->pwm->args.period - 1; state.enabled = true; From c318840fb2a42ce25febc95c4c19357acf1ae5ca Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Wed, 30 Dec 2020 11:20:44 -0500 Subject: [PATCH 260/547] USB: Gadget: dummy-hcd: Fix shift-out-of-bounds bug The dummy-hcd driver was written under the assumption that all the parameters in URBs sent to its root hub would be valid. With URBs sent from userspace via usbfs, that assumption can be violated. In particular, the driver doesn't fully check the port-feature values stored in the wValue entry of Clear-Port-Feature and Set-Port-Feature requests. Values that are too large can cause the driver to perform an invalid left shift of more than 32 bits. Ironically, two of those left shifts are unnecessary, because they implement Set-Port-Feature requests that hubs are not required to support, according to section 11.24.2.13 of the USB-2.0 spec. This patch adds the appropriate checks for the port feature selector values and removes the unnecessary feature settings. It also rejects requests to set the TEST feature or to set or clear the INDICATOR and C_OVERCURRENT features, as none of these are relevant to dummy-hcd's root-hub emulation. CC: Reported-and-tested-by: syzbot+5925509f78293baa7331@syzkaller.appspotmail.com Signed-off-by: Alan Stern Link: https://lore.kernel.org/r/20201230162044.GA727759@rowland.harvard.edu Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/dummy_hcd.c | 35 ++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/drivers/usb/gadget/udc/dummy_hcd.c b/drivers/usb/gadget/udc/dummy_hcd.c index ab5e978b5052..1a953f44183a 100644 --- a/drivers/usb/gadget/udc/dummy_hcd.c +++ b/drivers/usb/gadget/udc/dummy_hcd.c @@ -2118,9 +2118,21 @@ static int dummy_hub_control( dum_hcd->port_status &= ~USB_PORT_STAT_POWER; set_link_state(dum_hcd); break; - default: + case USB_PORT_FEAT_ENABLE: + case USB_PORT_FEAT_C_ENABLE: + case USB_PORT_FEAT_C_SUSPEND: + /* Not allowed for USB-3 */ + if (hcd->speed == HCD_USB3) + goto error; + fallthrough; + case USB_PORT_FEAT_C_CONNECTION: + case USB_PORT_FEAT_C_RESET: dum_hcd->port_status &= ~(1 << wValue); set_link_state(dum_hcd); + break; + default: + /* Disallow INDICATOR and C_OVER_CURRENT */ + goto error; } break; case GetHubDescriptor: @@ -2281,18 +2293,17 @@ static int dummy_hub_control( */ dum_hcd->re_timeout = jiffies + msecs_to_jiffies(50); fallthrough; + case USB_PORT_FEAT_C_CONNECTION: + case USB_PORT_FEAT_C_RESET: + case USB_PORT_FEAT_C_ENABLE: + case USB_PORT_FEAT_C_SUSPEND: + /* Not allowed for USB-3, and ignored for USB-2 */ + if (hcd->speed == HCD_USB3) + goto error; + break; default: - if (hcd->speed == HCD_USB3) { - if ((dum_hcd->port_status & - USB_SS_PORT_STAT_POWER) != 0) { - dum_hcd->port_status |= (1 << wValue); - } - } else - if ((dum_hcd->port_status & - USB_PORT_STAT_POWER) != 0) { - dum_hcd->port_status |= (1 << wValue); - } - set_link_state(dum_hcd); + /* Disallow TEST, INDICATOR, and C_OVER_CURRENT */ + goto error; } break; case GetPortErrorCount: From be1283454b61a1f3b089f1a74b73e20532262e32 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 29 Dec 2020 18:08:18 +0100 Subject: [PATCH 261/547] cpufreq: intel_pstate: Fix fast-switch fallback path When sugov_update_single_perf() falls back to the "frequency" path due to the missing scale-invariance, it will call cpufreq_driver_fast_switch() via sugov_fast_switch() and the driver's ->fast_switch() callback will be invoked, so it must not be NULL. However, after commit a365ab6b9dfb ("cpufreq: intel_pstate: Implement the ->adjust_perf() callback") intel_pstate sets ->fast_switch() to NULL when it is going to use intel_cpufreq_adjust_perf(), which is a mistake, because on x86 the scale-invariance may be turned off dynamically, so modify it to retain the original ->adjust_perf() callback pointer. Fixes: a365ab6b9dfb ("cpufreq: intel_pstate: Implement the ->adjust_perf() callback") Reported-by: Kenneth R. Crudup Tested-by: Kenneth R. Crudup Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 6e23376548ce..1a660466dd75 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -3086,7 +3086,6 @@ static int __init intel_pstate_init(void) intel_pstate.attr = hwp_cpufreq_attrs; intel_cpufreq.attr = hwp_cpufreq_attrs; intel_cpufreq.flags |= CPUFREQ_NEED_UPDATE_LIMITS; - intel_cpufreq.fast_switch = NULL; intel_cpufreq.adjust_perf = intel_cpufreq_adjust_perf; if (!default_driver) default_driver = &intel_pstate; From 9cf93f056f783f986c19f40d5304d1bcffa0fc0d Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sun, 27 Dec 2020 12:11:16 +0200 Subject: [PATCH 262/547] intel_idle: add SnowRidge C-state table Add C-state table for the SnowRidge SoC which is found on Intel Jacobsville platforms. The following has been changed. 1. C1E latency changed from 10us to 15us. It was measured using the open source "wult" tool (the "nic" method, 15us is the 99.99th percentile). 2. C1E power break even changed from 20us to 25us, which may result in less C1E residency in some workloads. 3. C6 latency changed from 50us to 130us. Measured the same way as C1E. The C6 C-state is supported only by some SnowRidge revisions, so add a C-state table commentary about this. On SnowRidge, C6 support is enumerated via the usual mechanism: "mwait" leaf of the "cpuid" instruction. The 'intel_idle' driver does check this leaf, so even though C6 is present in the table, the driver will only use it if the CPU does support it. Signed-off-by: Artem Bityutskiy Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 41 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index d79335506ecd..28f93b9aa51b 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -963,6 +963,39 @@ static struct cpuidle_state dnv_cstates[] __initdata = { .enter = NULL } }; +/* + * Note, depending on HW and FW revision, SnowRidge SoC may or may not support + * C6, and this is indicated in the CPUID mwait leaf. + */ +static struct cpuidle_state snr_cstates[] __initdata = { + { + .name = "C1", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 2, + .target_residency = 2, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C1E", + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 15, + .target_residency = 25, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .name = "C6", + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 130, + .target_residency = 500, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { + .enter = NULL } +}; + static const struct idle_cpu idle_cpu_nehalem __initconst = { .state_table = nehalem_cstates, .auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE, @@ -1084,6 +1117,12 @@ static const struct idle_cpu idle_cpu_dnv __initconst = { .use_acpi = true, }; +static const struct idle_cpu idle_cpu_snr __initconst = { + .state_table = snr_cstates, + .disable_promotion_to_c1e = true, + .use_acpi = true, +}; + static const struct x86_cpu_id intel_idle_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EP, &idle_cpu_nhx), X86_MATCH_INTEL_FAM6_MODEL(NEHALEM, &idle_cpu_nehalem), @@ -1122,7 +1161,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT, &idle_cpu_bxt), X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS, &idle_cpu_bxt), X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D, &idle_cpu_dnv), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &idle_cpu_dnv), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &idle_cpu_snr), {} }; From 1642b4450d20e31439c80c28256c8eee08684698 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 30 Dec 2020 21:34:14 +0000 Subject: [PATCH 263/547] io_uring: add a helper for setting a ref node Setting a new reference node to a file data is not trivial, don't repeat it, add and use a helper. Cc: stable@vger.kernel.org # 5.6+ Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index eb4620ff638e..6372aba8d0c2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7231,6 +7231,16 @@ static void io_file_ref_kill(struct percpu_ref *ref) complete(&data->done); } +static void io_sqe_files_set_node(struct fixed_file_data *file_data, + struct fixed_file_ref_node *ref_node) +{ + spin_lock_bh(&file_data->lock); + file_data->node = ref_node; + list_add_tail(&ref_node->node, &file_data->ref_list); + spin_unlock_bh(&file_data->lock); + percpu_ref_get(&file_data->refs); +} + static int io_sqe_files_unregister(struct io_ring_ctx *ctx) { struct fixed_file_data *data = ctx->file_data; @@ -7758,11 +7768,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return PTR_ERR(ref_node); } - file_data->node = ref_node; - spin_lock_bh(&file_data->lock); - list_add_tail(&ref_node->node, &file_data->ref_list); - spin_unlock_bh(&file_data->lock); - percpu_ref_get(&file_data->refs); + io_sqe_files_set_node(file_data, ref_node); return ret; out_fput: for (i = 0; i < ctx->nr_user_files; i++) { @@ -7918,11 +7924,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, if (needs_switch) { percpu_ref_kill(&data->node->refs); - spin_lock_bh(&data->lock); - list_add_tail(&ref_node->node, &data->ref_list); - data->node = ref_node; - spin_unlock_bh(&data->lock); - percpu_ref_get(&ctx->file_data->refs); + io_sqe_files_set_node(data, ref_node); } else destroy_fixed_file_ref_node(ref_node); From 1ffc54220c444774b7f09e6d2121e732f8e19b94 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 30 Dec 2020 21:34:15 +0000 Subject: [PATCH 264/547] io_uring: fix io_sqe_files_unregister() hangs io_sqe_files_unregister() uninterruptibly waits for enqueued ref nodes, however requests keeping them may never complete, e.g. because of some userspace dependency. Make sure it's interruptible otherwise it would hang forever. Cc: stable@vger.kernel.org # 5.6+ Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6372aba8d0c2..ca46f314640b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -992,6 +992,10 @@ enum io_mem_account { ACCT_PINNED, }; +static void destroy_fixed_file_ref_node(struct fixed_file_ref_node *ref_node); +static struct fixed_file_ref_node *alloc_fixed_file_ref_node( + struct io_ring_ctx *ctx); + static void __io_complete_rw(struct io_kiocb *req, long res, long res2, struct io_comp_state *cs); static void io_cqring_fill_event(struct io_kiocb *req, long res); @@ -7244,11 +7248,15 @@ static void io_sqe_files_set_node(struct fixed_file_data *file_data, static int io_sqe_files_unregister(struct io_ring_ctx *ctx) { struct fixed_file_data *data = ctx->file_data; - struct fixed_file_ref_node *ref_node = NULL; + struct fixed_file_ref_node *backup_node, *ref_node = NULL; unsigned nr_tables, i; + int ret; if (!data) return -ENXIO; + backup_node = alloc_fixed_file_ref_node(ctx); + if (!backup_node) + return -ENOMEM; spin_lock_bh(&data->lock); ref_node = data->node; @@ -7260,7 +7268,18 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) /* wait for all refs nodes to complete */ flush_delayed_work(&ctx->file_put_work); - wait_for_completion(&data->done); + do { + ret = wait_for_completion_interruptible(&data->done); + if (!ret) + break; + ret = io_run_task_work_sig(); + if (ret < 0) { + percpu_ref_resurrect(&data->refs); + reinit_completion(&data->done); + io_sqe_files_set_node(data, backup_node); + return ret; + } + } while (1); __io_sqe_files_unregister(ctx); nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE); @@ -7271,6 +7290,7 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) kfree(data); ctx->file_data = NULL; ctx->nr_user_files = 0; + destroy_fixed_file_ref_node(backup_node); return 0; } From b1b6b5a30dce872f500dc43f067cba8e7f86fc7d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 30 Dec 2020 21:34:16 +0000 Subject: [PATCH 265/547] kernel/io_uring: cancel io_uring before task works For cancelling io_uring requests it needs either to be able to run currently enqueued task_works or having it shut down by that moment. Otherwise io_uring_cancel_files() may be waiting for requests that won't ever complete. Go with the first way and do cancellations before setting PF_EXITING and so before putting the task_work infrastructure into a transition state where task_work_run() would better not be called. Cc: stable@vger.kernel.org # 5.5+ Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/file.c | 2 -- kernel/exit.c | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/file.c b/fs/file.c index c0b60961c672..dab120b71e44 100644 --- a/fs/file.c +++ b/fs/file.c @@ -21,7 +21,6 @@ #include #include #include -#include unsigned int sysctl_nr_open __read_mostly = 1024*1024; unsigned int sysctl_nr_open_min = BITS_PER_LONG; @@ -428,7 +427,6 @@ void exit_files(struct task_struct *tsk) struct files_struct * files = tsk->files; if (files) { - io_uring_files_cancel(files); task_lock(tsk); tsk->files = NULL; task_unlock(tsk); diff --git a/kernel/exit.c b/kernel/exit.c index 3594291a8542..04029e35e69a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include @@ -776,6 +777,7 @@ void __noreturn do_exit(long code) schedule(); } + io_uring_files_cancel(tsk->files); exit_signals(tsk); /* sets PF_EXITING */ /* sync mm's RSS info before statistics gathering */ From f93274ef0fe972c120c96b3207f8fce376231a60 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 4 Dec 2020 09:01:36 +0100 Subject: [PATCH 266/547] crypto: asym_tpm: correct zero out potential secrets The function derive_pub_key() should be calling memzero_explicit() instead of memset() in case the complier decides to optimize away the call to memset() because it "knows" no one is going to touch the memory anymore. Cc: stable Reported-by: Ilil Blum Shem-Tov Tested-by: Ilil Blum Shem-Tov Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/X8ns4AfwjKudpyfe@kroah.com Signed-off-by: Greg Kroah-Hartman --- crypto/asymmetric_keys/asym_tpm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/asymmetric_keys/asym_tpm.c b/crypto/asymmetric_keys/asym_tpm.c index 511932aa94a6..0959613560b9 100644 --- a/crypto/asymmetric_keys/asym_tpm.c +++ b/crypto/asymmetric_keys/asym_tpm.c @@ -354,7 +354,7 @@ static uint32_t derive_pub_key(const void *pub_key, uint32_t len, uint8_t *buf) memcpy(cur, e, sizeof(e)); cur += sizeof(e); /* Zero parameters to satisfy set_pub_key ABI. */ - memset(cur, 0, SETKEY_PARAMS_SIZE); + memzero_explicit(cur, SETKEY_PARAMS_SIZE); return cur - buf; } From 744a11abc56405c5a106e63da30a941b6d27f737 Mon Sep 17 00:00:00 2001 From: bo liu Date: Tue, 29 Dec 2020 11:52:26 +0800 Subject: [PATCH 267/547] ALSA: hda/conexant: add a new hda codec CX11970 The current kernel does not support the cx11970 codec chip. Add a codec configuration item to kernel. [ Minor coding style fix by tiwai ] Signed-off-by: bo liu Cc: Link: https://lore.kernel.org/r/20201229035226.62120-1-bo.liu@senarytech.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_conexant.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c index be5000dd1585..d49cc4409d59 100644 --- a/sound/pci/hda/patch_conexant.c +++ b/sound/pci/hda/patch_conexant.c @@ -1070,6 +1070,7 @@ static int patch_conexant_auto(struct hda_codec *codec) static const struct hda_device_id snd_hda_id_conexant[] = { HDA_CODEC_ENTRY(0x14f11f86, "CX8070", patch_conexant_auto), HDA_CODEC_ENTRY(0x14f12008, "CX8200", patch_conexant_auto), + HDA_CODEC_ENTRY(0x14f120d0, "CX11970", patch_conexant_auto), HDA_CODEC_ENTRY(0x14f15045, "CX20549 (Venice)", patch_conexant_auto), HDA_CODEC_ENTRY(0x14f15047, "CX20551 (Waikiki)", patch_conexant_auto), HDA_CODEC_ENTRY(0x14f15051, "CX20561 (Hermosa)", patch_conexant_auto), From 484229585a5e91eeb00ee10e05d5204e1ca6c481 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20Jim=C3=A9nez?= Date: Tue, 29 Dec 2020 15:38:56 +0100 Subject: [PATCH 268/547] ALSA: hda/realtek: Add mute LED quirk for more HP laptops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HP Pavilion 13-bb0000 (SSID 103c:87c8) needs the same quirk as other models with ALC287. Signed-off-by: Manuel Jiménez Cc: Link: https://lore.kernel.org/r/X+s/gKNydVrI6nLj@HP-Pavilion-13 Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index f49cf0f227c4..a7015b3a62c7 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -7969,6 +7969,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8760, "HP", ALC285_FIXUP_HP_MUTE_LED), SND_PCI_QUIRK(0x103c, 0x877a, "HP", ALC285_FIXUP_HP_MUTE_LED), SND_PCI_QUIRK(0x103c, 0x877d, "HP", ALC236_FIXUP_HP_MUTE_LED), + SND_PCI_QUIRK(0x103c, 0x87c8, "HP", ALC287_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x87f4, "HP", ALC287_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x87f5, "HP", ALC287_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x1043, 0x103e, "ASUS X540SA", ALC256_FIXUP_ASUS_MIC), From ce2e79b223867b9e586021b55dee7035517a236b Mon Sep 17 00:00:00 2001 From: PeiSen Hou Date: Thu, 31 Dec 2020 11:57:28 +0100 Subject: [PATCH 269/547] ALSA: hda/realtek: Add two "Intel Reference board" SSID in the ALC256. Add two "Intel Reference boad" SSID in the alc256. Enable "power saving mode" and Enable "headset jack mode". Signed-off-by: PeiSen Hou Cc: Link: https://lore.kernel.org/r/5978d2267f034c28973d117925ec9c63@realtek.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index a7015b3a62c7..71a85bf99055 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -8027,6 +8027,8 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x10cf, 0x1845, "Lifebook U904", ALC269_FIXUP_LIFEBOOK_EXTMIC), SND_PCI_QUIRK(0x10ec, 0x10f2, "Intel Reference board", ALC700_FIXUP_INTEL_REFERENCE), SND_PCI_QUIRK(0x10ec, 0x1230, "Intel Reference board", ALC295_FIXUP_CHROME_BOOK), + SND_PCI_QUIRK(0x10ec, 0x1252, "Intel Reference board", ALC295_FIXUP_CHROME_BOOK), + SND_PCI_QUIRK(0x10ec, 0x1254, "Intel Reference board", ALC295_FIXUP_CHROME_BOOK), SND_PCI_QUIRK(0x10f7, 0x8338, "Panasonic CF-SZ6", ALC269_FIXUP_HEADSET_MODE), SND_PCI_QUIRK(0x144d, 0xc109, "Samsung Ativ book 9 (NP900X3G)", ALC269_FIXUP_INV_DMIC), SND_PCI_QUIRK(0x144d, 0xc169, "Samsung Notebook 9 Pen (NP930SBE-K01US)", ALC298_FIXUP_SAMSUNG_HEADPHONE_VERY_QUIET), From a598098cc9737f612dbab52294433fc26c51cc9b Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Wed, 30 Dec 2020 20:56:35 +0800 Subject: [PATCH 270/547] ALSA: hda/realtek: Enable mute and micmute LED on HP EliteBook 850 G7 HP EliteBook 850 G7 uses the same GPIO pins as ALC285_FIXUP_HP_GPIO_LED to enable mute and micmute LED. So apply the quirk to enable the LEDs. Signed-off-by: Kai-Heng Feng Cc: Link: https://lore.kernel.org/r/20201230125636.45028-1-kai.heng.feng@canonical.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 71a85bf99055..3c1d2a3fb1a4 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -7964,6 +7964,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8497, "HP Envy x360", ALC269_FIXUP_HP_MUTE_LED_MIC3), SND_PCI_QUIRK(0x103c, 0x84e7, "HP Pavilion 15", ALC269_FIXUP_HP_MUTE_LED_MIC3), SND_PCI_QUIRK(0x103c, 0x869d, "HP", ALC236_FIXUP_HP_MUTE_LED), + SND_PCI_QUIRK(0x103c, 0x8724, "HP EliteBook 850 G7", ALC285_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8729, "HP", ALC285_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8736, "HP", ALC285_FIXUP_HP_GPIO_AMP_INIT), SND_PCI_QUIRK(0x103c, 0x8760, "HP", ALC285_FIXUP_HP_MUTE_LED), From 957cbca7317f7413e1bac555a6b567af06598b10 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 31 Dec 2020 15:05:46 +0000 Subject: [PATCH 271/547] KVM: arm64: Remove spurious semicolon in reg_to_encoding() Although not a problem right now, it flared up while working on some other aspects of the code-base. Remove the useless semicolon. Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index d46e7f706cb0..42ccc27fb684 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -923,7 +923,7 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, #define reg_to_encoding(x) \ sys_reg((u32)(x)->Op0, (u32)(x)->Op1, \ - (u32)(x)->CRn, (u32)(x)->CRm, (u32)(x)->Op2); + (u32)(x)->CRn, (u32)(x)->CRm, (u32)(x)->Op2) /* Silly macro to expand the DBG{BCR,BVR,WVR,WCR}n_EL1 registers in one go */ #define DBG_BCR_BVR_WCR_WVR_EL1(n) \ From 4f8af077a02eed4831885048a10e04daa4e61a72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=ADcolas=20F=2E=20R=2E=20A=2E=20Prado?= Date: Mon, 28 Dec 2020 14:46:07 +0000 Subject: [PATCH 272/547] docs: Fix reST markup when linking to sections MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During the process of converting the documentation to reST, some links were converted using the following wrong syntax (and sometimes using %20 instead of spaces): `Display text <#section-name-in-html>`__ This syntax isn't valid according to the docutils' spec [1], but more importantly, it is specific to HTML, since it uses '#' to link to an HTML anchor. The right syntax would instead use a docutils hyperlink reference as the embedded URI to point to the section [2], that is: `Display text
`__ This syntax works in both HTML and PDF. The LaTeX toolchain doesn't mind the HTML anchor syntax when generating the pdf documentation (make pdfdocs), that is, the build succeeds but the links don't work, but that syntax causes errors when trying to build using the not-yet-merged rst2pdf: ValueError: format not resolved, probably missing URL scheme or undefined destination target for 'Forcing%20Quiescent%20States' So, use the correct syntax in order to have it work in all different output formats. [1]: https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#reference-names [2]: https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#embedded-uris-and-aliases Fixes: ccc9971e2147 ("docs: rcu: convert some articles from html to ReST") Fixes: c8cce10a62aa ("docs: Fix the reference labels in Locking.rst") Fixes: e548cdeffcd8 ("docs-rst: convert kernel-locking to ReST") Fixes: 7ddedebb03b7 ("ALSA: doc: ReSTize writing-an-alsa-driver document") Signed-off-by: Nícolas F. R. A. Prado Reviewed-by: Takashi Iwai Reviewed-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/20201228144537.135353-1-nfraprado@protonmail.com Signed-off-by: Jonathan Corbet --- .../Tree-RCU-Memory-Ordering.rst | 8 ++++---- .../RCU/Design/Requirements/Requirements.rst | 20 +++++++++---------- Documentation/kernel-hacking/locking.rst | 8 ++++---- .../kernel-api/writing-an-alsa-driver.rst | 16 +++++++-------- 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst index 83ae3b79a643..a648b423ba0e 100644 --- a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst +++ b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst @@ -473,7 +473,7 @@ read-side critical sections that follow the idle period (the oval near the bottom of the diagram above). Plumbing this into the full grace-period execution is described -`below <#Forcing%20Quiescent%20States>`__. +`below `__. CPU-Hotplug Interface ^^^^^^^^^^^^^^^^^^^^^ @@ -494,7 +494,7 @@ mask to detect CPUs having gone offline since the beginning of this grace period. Plumbing this into the full grace-period execution is described -`below <#Forcing%20Quiescent%20States>`__. +`below `__. Forcing Quiescent States ^^^^^^^^^^^^^^^^^^^^^^^^ @@ -532,7 +532,7 @@ from other CPUs. | RCU. But this diagram is complex enough as it is, so simplicity | | overrode accuracy. You can think of it as poetic license, or you can | | think of it as misdirection that is resolved in the | -| `stitched-together diagram <#Putting%20It%20All%20Together>`__. | +| `stitched-together diagram `__. | +-----------------------------------------------------------------------+ Grace-Period Cleanup @@ -596,7 +596,7 @@ maintain ordering. For example, if the callback function wakes up a task that runs on some other CPU, proper ordering must in place in both the callback function and the task being awakened. To see why this is important, consider the top half of the `grace-period -cleanup <#Grace-Period%20Cleanup>`__ diagram. The callback might be +cleanup`_ diagram. The callback might be running on a CPU corresponding to the leftmost leaf ``rcu_node`` structure, and awaken a task that is to run on a CPU corresponding to the rightmost leaf ``rcu_node`` structure, and the grace-period kernel diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst b/Documentation/RCU/Design/Requirements/Requirements.rst index e8c84fcc0507..d4c9a016074b 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.rst +++ b/Documentation/RCU/Design/Requirements/Requirements.rst @@ -45,7 +45,7 @@ requirements: #. `Other RCU Flavors`_ #. `Possible Future Changes`_ -This is followed by a `summary <#Summary>`__, however, the answers to +This is followed by a summary_, however, the answers to each quick quiz immediately follows the quiz. Select the big white space with your mouse to see the answer. @@ -1096,7 +1096,7 @@ memory barriers. | case, voluntary context switch) within an RCU read-side critical | | section. However, sleeping locks may be used within userspace RCU | | read-side critical sections, and also within Linux-kernel sleepable | -| RCU `(SRCU) <#Sleepable%20RCU>`__ read-side critical sections. In | +| RCU `(SRCU) `__ read-side critical sections. In | | addition, the -rt patchset turns spinlocks into a sleeping locks so | | that the corresponding critical sections can be preempted, which also | | means that these sleeplockified spinlocks (but not other sleeping | @@ -1186,7 +1186,7 @@ non-preemptible (``CONFIG_PREEMPT=n``) kernels, and thus `tiny RCU `__ was born. Josh Triplett has since taken over the small-memory banner with his `Linux kernel tinification `__ -project, which resulted in `SRCU <#Sleepable%20RCU>`__ becoming optional +project, which resulted in `SRCU `__ becoming optional for those kernels not needing it. The remaining performance requirements are, for the most part, @@ -1457,8 +1457,8 @@ will vary as the value of ``HZ`` varies, and can also be changed using the relevant Kconfig options and kernel boot parameters. RCU currently does not do much sanity checking of these parameters, so please use caution when changing them. Note that these forward-progress measures -are provided only for RCU, not for `SRCU <#Sleepable%20RCU>`__ or `Tasks -RCU <#Tasks%20RCU>`__. +are provided only for RCU, not for `SRCU `__ or `Tasks +RCU`_. RCU takes the following steps in ``call_rcu()`` to encourage timely invocation of callbacks when any given non-\ ``rcu_nocbs`` CPU has @@ -1477,8 +1477,8 @@ encouragement was provided: Again, these are default values when running at ``HZ=1000``, and can be overridden. Again, these forward-progress measures are provided only for -RCU, not for `SRCU <#Sleepable%20RCU>`__ or `Tasks -RCU <#Tasks%20RCU>`__. Even for RCU, callback-invocation forward +RCU, not for `SRCU `__ or `Tasks +RCU`_. Even for RCU, callback-invocation forward progress for ``rcu_nocbs`` CPUs is much less well-developed, in part because workloads benefiting from ``rcu_nocbs`` CPUs tend to invoke ``call_rcu()`` relatively infrequently. If workloads emerge that need @@ -1920,7 +1920,7 @@ Hotplug CPU The Linux kernel supports CPU hotplug, which means that CPUs can come and go. It is of course illegal to use any RCU API member from an -offline CPU, with the exception of `SRCU <#Sleepable%20RCU>`__ read-side +offline CPU, with the exception of `SRCU `__ read-side critical sections. This requirement was present from day one in DYNIX/ptx, but on the other hand, the Linux kernel's CPU-hotplug implementation is “interesting.” @@ -2177,7 +2177,7 @@ handles these states differently: However, RCU must be reliably informed as to whether any given CPU is currently in the idle loop, and, for ``NO_HZ_FULL``, also whether that CPU is executing in usermode, as discussed -`earlier <#Energy%20Efficiency>`__. It also requires that the +`earlier `__. It also requires that the scheduling-clock interrupt be enabled when RCU needs it to be: #. If a CPU is either idle or executing in usermode, and RCU believes it @@ -2294,7 +2294,7 @@ Performance, Scalability, Response Time, and Reliability ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Expanding on the `earlier -discussion <#Performance%20and%20Scalability>`__, RCU is used heavily by +discussion `__, RCU is used heavily by hot code paths in performance-critical portions of the Linux kernel's networking, security, virtualization, and scheduling code paths. RCU must therefore use efficient implementations, especially in its diff --git a/Documentation/kernel-hacking/locking.rst b/Documentation/kernel-hacking/locking.rst index 6ed806e6061b..c3448929a824 100644 --- a/Documentation/kernel-hacking/locking.rst +++ b/Documentation/kernel-hacking/locking.rst @@ -118,11 +118,11 @@ spinlock, but you may block holding a mutex. If you can't lock a mutex, your task will suspend itself, and be woken up when the mutex is released. This means the CPU can do something else while you are waiting. There are many cases when you simply can't sleep (see -`What Functions Are Safe To Call From Interrupts? <#sleeping-things>`__), +`What Functions Are Safe To Call From Interrupts?`_), and so have to use a spinlock instead. Neither type of lock is recursive: see -`Deadlock: Simple and Advanced <#deadlock>`__. +`Deadlock: Simple and Advanced`_. Locks and Uniprocessor Kernels ------------------------------ @@ -179,7 +179,7 @@ perfect world). Note that you can also use spin_lock_irq() or spin_lock_irqsave() here, which stop hardware interrupts -as well: see `Hard IRQ Context <#hard-irq-context>`__. +as well: see `Hard IRQ Context`_. This works perfectly for UP as well: the spin lock vanishes, and this macro simply becomes local_bh_disable() @@ -230,7 +230,7 @@ The Same Softirq ~~~~~~~~~~~~~~~~ The same softirq can run on the other CPUs: you can use a per-CPU array -(see `Per-CPU Data <#per-cpu-data>`__) for better performance. If you're +(see `Per-CPU Data`_) for better performance. If you're going so far as to use a softirq, you probably care about scalable performance enough to justify the extra complexity. diff --git a/Documentation/sound/kernel-api/writing-an-alsa-driver.rst b/Documentation/sound/kernel-api/writing-an-alsa-driver.rst index 73bbd59afc33..e6365836fa8b 100644 --- a/Documentation/sound/kernel-api/writing-an-alsa-driver.rst +++ b/Documentation/sound/kernel-api/writing-an-alsa-driver.rst @@ -71,7 +71,7 @@ core/oss The codes for PCM and mixer OSS emulation modules are stored in this directory. The rawmidi OSS emulation is included in the ALSA rawmidi code since it's quite small. The sequencer code is stored in -``core/seq/oss`` directory (see `below <#core-seq-oss>`__). +``core/seq/oss`` directory (see `below `__). core/seq ~~~~~~~~ @@ -382,7 +382,7 @@ where ``enable[dev]`` is the module option. Each time the ``probe`` callback is called, check the availability of the device. If not available, simply increment the device index and returns. dev will be incremented also later (`step 7 -<#set-the-pci-driver-data-and-return-zero>`__). +<7) Set the PCI driver data and return zero._>`__). 2) Create a card instance ~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -450,10 +450,10 @@ field contains the information shown in ``/proc/asound/cards``. 5) Create other components, such as mixer, MIDI, etc. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Here you define the basic components such as `PCM <#PCM-Interface>`__, -mixer (e.g. `AC97 <#API-for-AC97-Codec>`__), MIDI (e.g. -`MPU-401 <#MIDI-MPU401-UART-Interface>`__), and other interfaces. -Also, if you want a `proc file <#Proc-Interface>`__, define it here, +Here you define the basic components such as `PCM `__, +mixer (e.g. `AC97 `__), MIDI (e.g. +`MPU-401 `__), and other interfaces. +Also, if you want a `proc file `__, define it here, too. 6) Register the card instance. @@ -941,7 +941,7 @@ The allocation of an interrupt source is done like this: chip->irq = pci->irq; where :c:func:`snd_mychip_interrupt()` is the interrupt handler -defined `later <#pcm-interface-interrupt-handler>`__. Note that +defined `later `__. Note that ``chip->irq`` should be defined only when :c:func:`request_irq()` succeeded. @@ -3104,7 +3104,7 @@ processing the output stream in the irq handler. If the MPU-401 interface shares its interrupt with the other logical devices on the card, set ``MPU401_INFO_IRQ_HOOK`` (see -`below <#MIDI-Interrupt-Handler>`__). +`below `__). Usually, the port address corresponds to the command port and port + 1 corresponds to the data port. If not, you may change the ``cport`` From 81e79063004f32aae5196f0c929192e69aca1694 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 26 Dec 2020 09:44:33 -0800 Subject: [PATCH 273/547] Documentation: admin: early_param()s are also listed in kernel-parameters Add info that "early_param()" kernel boot parameters are also listed in kernel-parameters.txt. Signed-off-by: Randy Dunlap Cc: Jonathan Corbet Cc: linux-doc@vger.kernel.org Link: https://lore.kernel.org/r/20201226174433.7885-1-rdunlap@infradead.org Signed-off-by: Jonathan Corbet --- Documentation/admin-guide/kernel-parameters.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.rst b/Documentation/admin-guide/kernel-parameters.rst index 06fb1b4aa849..682ab28b5c94 100644 --- a/Documentation/admin-guide/kernel-parameters.rst +++ b/Documentation/admin-guide/kernel-parameters.rst @@ -3,8 +3,8 @@ The kernel's command-line parameters ==================================== -The following is a consolidated list of the kernel parameters as -implemented by the __setup(), core_param() and module_param() macros +The following is a consolidated list of the kernel parameters as implemented +by the __setup(), early_param(), core_param() and module_param() macros and sorted into English Dictionary order (defined as ignoring all punctuation and sorting digits before letters in a case insensitive manner), and with descriptions where known. From c7e74b3c7b1cf4c04164ff16e6c047232fd3bcef Mon Sep 17 00:00:00 2001 From: Liao Pingfang Date: Sun, 27 Dec 2020 15:15:19 +0800 Subject: [PATCH 274/547] docs/mm: concepts.rst: Correct the threshold to low watermark It should be "low watermark" where we wake up kswapd daemon. Signed-off-by: Liao Pingfang Link: https://lore.kernel.org/r/1609053319-3112-1-git-send-email-winndows@163.com Signed-off-by: Jonathan Corbet --- Documentation/admin-guide/mm/concepts.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/mm/concepts.rst b/Documentation/admin-guide/mm/concepts.rst index fa0974fbeae7..b966fcff993b 100644 --- a/Documentation/admin-guide/mm/concepts.rst +++ b/Documentation/admin-guide/mm/concepts.rst @@ -184,7 +184,7 @@ pages either asynchronously or synchronously, depending on the state of the system. When the system is not loaded, most of the memory is free and allocation requests will be satisfied immediately from the free pages supply. As the load increases, the amount of the free pages goes -down and when it reaches a certain threshold (high watermark), an +down and when it reaches a certain threshold (low watermark), an allocation request will awaken the ``kswapd`` daemon. It will asynchronously scan memory pages and either just free them if the data they contain is available elsewhere, or evict to the backing storage From 0be1511f516e2b9766597336cedc6dc6d19e5af1 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 28 Dec 2020 15:12:12 -0800 Subject: [PATCH 275/547] Documentation: doc-guide: fixes to sphinx.rst Various fixes to sphinx.rst: - eliminate a double-space between 2 words - grammar/wording - punctuation - call rows in a table 'rows' instead of 'columns' (or does Sphinx call everything a column?) - It seems that "amdfonts" should be "amsfonts". I can't find any amdfonts. Signed-off-by: Randy Dunlap Reviewed-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/20201228231212.22448-1-rdunlap@infradead.org Signed-off-by: Jonathan Corbet --- Documentation/doc-guide/sphinx.rst | 32 +++++++++++++++--------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/Documentation/doc-guide/sphinx.rst b/Documentation/doc-guide/sphinx.rst index 2fb2ff297d69..36ac2166ad67 100644 --- a/Documentation/doc-guide/sphinx.rst +++ b/Documentation/doc-guide/sphinx.rst @@ -48,12 +48,12 @@ or ``virtualenv``, depending on how your distribution packaged Python 3. those versions, you should run ``pip install 'docutils==0.12'``. #) It is recommended to use the RTD theme for html output. Depending - on the Sphinx version, it should be installed in separate, + on the Sphinx version, it should be installed separately, with ``pip install sphinx_rtd_theme``. - #) Some ReST pages contain math expressions. Due to the way Sphinx work, + #) Some ReST pages contain math expressions. Due to the way Sphinx works, those expressions are written using LaTeX notation. It needs texlive - installed with amdfonts and amsmath in order to evaluate them. + installed with amsfonts and amsmath in order to evaluate them. In summary, if you want to install Sphinx version 1.7.9, you should do:: @@ -128,7 +128,7 @@ Sphinx Build ============ The usual way to generate the documentation is to run ``make htmldocs`` or -``make pdfdocs``. There are also other formats available, see the documentation +``make pdfdocs``. There are also other formats available: see the documentation section of ``make help``. The generated documentation is placed in format-specific subdirectories under ``Documentation/output``. @@ -303,17 +303,17 @@ and *targets* (e.g. a ref to ``:ref:`last row ``` / :ref:`last row - head col 3 - head col 4 - * - column 1 + * - row 1 - field 1.1 - field 1.2 with autospan - * - column 2 + * - row 2 - field 2.1 - :rspan:`1` :cspan:`1` field 2.2 - 3.3 * .. _`last row`: - - column 3 + - row 3 Rendered as: @@ -325,17 +325,17 @@ Rendered as: - head col 3 - head col 4 - * - column 1 + * - row 1 - field 1.1 - field 1.2 with autospan - * - column 2 + * - row 2 - field 2.1 - :rspan:`1` :cspan:`1` field 2.2 - 3.3 * .. _`last row`: - - column 3 + - row 3 Cross-referencing ----------------- @@ -361,7 +361,7 @@ Figures & Images If you want to add an image, you should use the ``kernel-figure`` and ``kernel-image`` directives. E.g. to insert a figure with a scalable -image format use SVG (:ref:`svg_image_example`):: +image format, use SVG (:ref:`svg_image_example`):: .. kernel-figure:: svg_image.svg :alt: simple SVG image @@ -375,7 +375,7 @@ image format use SVG (:ref:`svg_image_example`):: SVG image example -The kernel figure (and image) directive support **DOT** formatted files, see +The kernel figure (and image) directive supports **DOT** formatted files, see * DOT: http://graphviz.org/pdf/dotguide.pdf * Graphviz: http://www.graphviz.org/content/dot-language @@ -394,7 +394,7 @@ A simple example (:ref:`hello_dot_file`):: DOT's hello world example -Embed *render* markups (or languages) like Graphviz's **DOT** is provided by the +Embedded *render* markups (or languages) like Graphviz's **DOT** are provided by the ``kernel-render`` directives.:: .. kernel-render:: DOT @@ -406,7 +406,7 @@ Embed *render* markups (or languages) like Graphviz's **DOT** is provided by the } How this will be rendered depends on the installed tools. If Graphviz is -installed, you will see an vector image. If not the raw markup is inserted as +installed, you will see a vector image. If not, the raw markup is inserted as *literal-block* (:ref:`hello_dot_render`). .. _hello_dot_render: @@ -421,8 +421,8 @@ installed, you will see an vector image. If not the raw markup is inserted as The *render* directive has all the options known from the *figure* directive, plus option ``caption``. If ``caption`` has a value, a *figure* node is -inserted. If not, a *image* node is inserted. A ``caption`` is also needed, if -you want to refer it (:ref:`hello_svg_render`). +inserted. If not, an *image* node is inserted. A ``caption`` is also needed, if +you want to refer to it (:ref:`hello_svg_render`). Embedded **SVG**:: From 798ed7800e20dfc3304de1b99df5ac71ad48966b Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Sun, 20 Dec 2020 07:09:27 +0100 Subject: [PATCH 276/547] atomic: remove further references to atomic_ops Commit f0400a77ebdc ("atomic: Delete obsolete documentation") removed ./Documentation/core-api/atomic_ops.rst, but missed to remove further references to that file. Hence, make htmldocs warns: Documentation/core-api/index.rst:53: WARNING: toctree contains reference to nonexisting document 'core-api/atomic_ops' Also, ./scripts/get_maintainer.pl --self-test=patterns warns: warning: no file matches F: Documentation/core-api/atomic_ops.rst Remove further references to ./Documentation/core-api/atomic_ops.rst. Fixes: f0400a77ebdc ("atomic: Delete obsolete documentation") Signed-off-by: Lukas Bulwahn Link: https://lore.kernel.org/r/20201220060927.21582-1-lukas.bulwahn@gmail.com Signed-off-by: Jonathan Corbet --- Documentation/core-api/index.rst | 1 - MAINTAINERS | 1 - 2 files changed, 2 deletions(-) diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst index 69171b1799f2..f1c9d20bd42d 100644 --- a/Documentation/core-api/index.rst +++ b/Documentation/core-api/index.rst @@ -53,7 +53,6 @@ How Linux keeps everything from happening at the same time. See .. toctree:: :maxdepth: 1 - atomic_ops refcount-vs-atomic irq/index local_ops diff --git a/MAINTAINERS b/MAINTAINERS index 546aa66428c9..ddc8e90c1224 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10260,7 +10260,6 @@ S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git dev F: Documentation/atomic_bitops.txt F: Documentation/atomic_t.txt -F: Documentation/core-api/atomic_ops.rst F: Documentation/core-api/refcount-vs-atomic.rst F: Documentation/litmus-tests/ F: Documentation/memory-barriers.txt From 3d5c5fdcee0f9a94deb0472e594706018b00aa31 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 1 Jan 2021 09:38:52 +0100 Subject: [PATCH 277/547] ALSA: hda/hdmi: Fix incorrect mutex unlock in silent_stream_disable() The silent_stream_disable() function introduced by the commit b1a5039759cb ("ALSA: hda/hdmi: fix silent stream for first playback to DP") takes the per_pin->lock mutex, but it unlocks the wrong one, spec->pcm_lock, which causes a deadlock. This patch corrects it. Fixes: b1a5039759cb ("ALSA: hda/hdmi: fix silent stream for first playback to DP") Reported-by: Jan Alexander Steffens (heftig) Cc: Acked-by: Kai Vehmanen Link: https://lore.kernel.org/r/20210101083852.12094-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_hdmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c index 1e4a4b83fbf6..74d246a0dc6d 100644 --- a/sound/pci/hda/patch_hdmi.c +++ b/sound/pci/hda/patch_hdmi.c @@ -1733,7 +1733,7 @@ static void silent_stream_disable(struct hda_codec *codec, per_pin->silent_stream = false; unlock_out: - mutex_unlock(&spec->pcm_lock); + mutex_unlock(&per_pin->lock); } /* update ELD and jack state via audio component */ From cedd1862be7e666be87ec824dabc6a2b05618f36 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 28 Dec 2020 11:40:22 -0800 Subject: [PATCH 278/547] depmod: handle the case of /sbin/depmod without /sbin in PATH Commit 436e980e2ed5 ("kbuild: don't hardcode depmod path") stopped hard-coding the path of depmod, but in the process caused trouble for distributions that had that /sbin location, but didn't have it in the PATH (generally because /sbin is limited to the super-user path). Work around it for now by just adding /sbin to the end of PATH in the depmod.sh script. Reported-and-tested-by: Sedat Dilek Signed-off-by: Linus Torvalds --- scripts/depmod.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/depmod.sh b/scripts/depmod.sh index e083bcae343f..3643b4f896ed 100755 --- a/scripts/depmod.sh +++ b/scripts/depmod.sh @@ -15,6 +15,8 @@ if ! test -r System.map ; then exit 0 fi +# legacy behavior: "depmod" in /sbin, no /sbin in PATH +PATH="$PATH:/sbin" if [ -z $(command -v $DEPMOD) ]; then echo "Warning: 'make modules_install' requires $DEPMOD. Please install it." >&2 echo "This is probably in the kmod package." >&2 From fd16931a2f518a32753920ff20895e5cf04c8ff1 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sun, 13 Dec 2020 15:39:29 +0100 Subject: [PATCH 279/547] crypto: arm/chacha-neon - add missing counter increment Commit 86cd97ec4b943af3 ("crypto: arm/chacha-neon - optimize for non-block size multiples") refactored the chacha block handling in the glue code in a way that may result in the counter increment to be omitted when calling chacha_block_xor_neon() to process a full block. This violates the skcipher API, which requires that the output IV is suitable for handling more input as long as the preceding input has been presented in round multiples of the block size. Also, the same code is exposed via the chacha library interface whose callers may actually rely on this increment to occur even for final blocks that are smaller than the chacha block size. So increment the counter after calling chacha_block_xor_neon(). Fixes: 86cd97ec4b943af3 ("crypto: arm/chacha-neon - optimize for non-block size multiples") Reported-by: Eric Biggers Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm/crypto/chacha-glue.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/crypto/chacha-glue.c b/arch/arm/crypto/chacha-glue.c index 7b5cf8430c6d..cdde8fd01f8f 100644 --- a/arch/arm/crypto/chacha-glue.c +++ b/arch/arm/crypto/chacha-glue.c @@ -60,6 +60,7 @@ static void chacha_doneon(u32 *state, u8 *dst, const u8 *src, chacha_block_xor_neon(state, d, s, nrounds); if (d != dst) memcpy(dst, buf, bytes); + state[12]++; } } From 0aa171e9b267ce7c52d3a3df7bc9c1fc0203dec5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 2 Jan 2021 14:59:09 +0100 Subject: [PATCH 280/547] crypto: ecdh - avoid buffer overflow in ecdh_set_secret() Pavel reports that commit 17858b140bf4 ("crypto: ecdh - avoid unaligned accesses in ecdh_set_secret()") fixes one problem but introduces another: the unconditional memcpy() introduced by that commit may overflow the target buffer if the source data is invalid, which could be the result of intentional tampering. So check params.key_size explicitly against the size of the target buffer before validating the key further. Fixes: 17858b140bf4 ("crypto: ecdh - avoid unaligned accesses in ecdh_set_secret()") Reported-by: Pavel Machek Cc: Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- crypto/ecdh.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crypto/ecdh.c b/crypto/ecdh.c index d56b8603dec9..96f80c8f8e30 100644 --- a/crypto/ecdh.c +++ b/crypto/ecdh.c @@ -39,7 +39,8 @@ static int ecdh_set_secret(struct crypto_kpp *tfm, const void *buf, struct ecdh params; unsigned int ndigits; - if (crypto_ecdh_decode_key(buf, len, ¶ms) < 0) + if (crypto_ecdh_decode_key(buf, len, ¶ms) < 0 || + params.key_size > sizeof(ctx->private_key)) return -EINVAL; ndigits = ecdh_supported_curve(params.curve_id); From 04901aab40ea3779f6fc6383ef74d8e130e817bf Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 30 Dec 2020 21:24:18 -0800 Subject: [PATCH 281/547] bpf: Fix a task_iter bug caused by a merge conflict resolution Latest bpf tree has a bug for bpf_iter selftest: $ ./test_progs -n 4/25 test_bpf_sk_storage_get:PASS:bpf_iter_bpf_sk_storage_helpers__open_and_load 0 nsec test_bpf_sk_storage_get:PASS:socket 0 nsec ... do_dummy_read:PASS:read 0 nsec test_bpf_sk_storage_get:FAIL:bpf_map_lookup_elem map value wasn't set correctly (expected 1792, got -1, err=0) #4/25 bpf_sk_storage_get:FAIL #4 bpf_iter:FAIL Summary: 0/0 PASSED, 0 SKIPPED, 2 FAILED When doing merge conflict resolution, Commit 4bfc4714849d missed to save curr_task to seq_file private data. The task pointer in seq_file private data is passed to bpf program. This caused NULL-pointer task passed to bpf program which will immediately return upon checking whether task pointer is NULL. This patch added back the assignment of curr_task to seq_file private data and fixed the issue. Fixes: 4bfc4714849d ("Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf") Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20201231052418.577024-1-yhs@fb.com --- kernel/bpf/task_iter.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 3efe38191d1c..175b7b42bfc4 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -159,6 +159,7 @@ again: } /* set info->task and info->tid */ + info->task = curr_task; if (curr_tid == info->tid) { curr_fd = info->fd; } else { From b0e1306302018d876472ed074c1bfaa8020bf9df Mon Sep 17 00:00:00 2001 From: Timon Reinold Date: Sat, 2 Jan 2021 22:08:35 +0100 Subject: [PATCH 282/547] ALSA: usb-audio: Add quirk for RC-505 BOSS RC-505 (shown by lsusb as "Roland Corp. RC-505") does require the same quirk as these other BOSS devices. Without this quirk it is neither possible to capture audio from nor to write audio to the RC-505. Both just result in an empty audio stream. With these changes both capture and playback seem to work quite fine. MIDI funtionality was not tested. Tested-by: Harry Reinold Signed-off-by: Timon Reinold Link: https://lore.kernel.org/r/20210102210835.21268-1-tirei@agon.one Signed-off-by: Takashi Iwai --- sound/usb/implicit.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/usb/implicit.c b/sound/usb/implicit.c index 70b9777b2e63..931042a6a051 100644 --- a/sound/usb/implicit.c +++ b/sound/usb/implicit.c @@ -74,6 +74,7 @@ static const struct snd_usb_implicit_fb_match playback_implicit_fb_quirks[] = { /* No quirk for playback but with capture quirk (see below) */ IMPLICIT_FB_SKIP_DEV(0x0582, 0x0130), /* BOSS BR-80 */ + IMPLICIT_FB_SKIP_DEV(0x0582, 0x0171), /* BOSS RC-505 */ IMPLICIT_FB_SKIP_DEV(0x0582, 0x0189), /* BOSS GT-100v2 */ IMPLICIT_FB_SKIP_DEV(0x0582, 0x01d6), /* BOSS GT-1 */ IMPLICIT_FB_SKIP_DEV(0x0582, 0x01d8), /* BOSS Katana */ @@ -86,6 +87,7 @@ static const struct snd_usb_implicit_fb_match playback_implicit_fb_quirks[] = { /* Implicit feedback quirk table for capture: only FIXED type */ static const struct snd_usb_implicit_fb_match capture_implicit_fb_quirks[] = { IMPLICIT_FB_FIXED_DEV(0x0582, 0x0130, 0x0d, 0x01), /* BOSS BR-80 */ + IMPLICIT_FB_FIXED_DEV(0x0582, 0x0171, 0x0d, 0x01), /* BOSS RC-505 */ IMPLICIT_FB_FIXED_DEV(0x0582, 0x0189, 0x0d, 0x01), /* BOSS GT-100v2 */ IMPLICIT_FB_FIXED_DEV(0x0582, 0x01d6, 0x0d, 0x01), /* BOSS GT-1 */ IMPLICIT_FB_FIXED_DEV(0x0582, 0x01d8, 0x0d, 0x01), /* BOSS Katana */ From 36a106a4c1c100d55ba3d32a21ef748cfcd4fa99 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sun, 3 Jan 2021 22:42:39 +0100 Subject: [PATCH 283/547] block: rsxx: select CONFIG_CRC32 Without crc32, the driver fails to link: arm-linux-gnueabi-ld: drivers/block/rsxx/config.o: in function `rsxx_load_config': config.c:(.text+0x124): undefined reference to `crc32_le' Fixes: 8722ff8cdbfa ("block: IBM RamSan 70/80 device driver") Signed-off-by: Arnd Bergmann Signed-off-by: Jens Axboe --- drivers/block/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 262326973ee0..583b671b1d2d 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -445,6 +445,7 @@ config BLK_DEV_RBD config BLK_DEV_RSXX tristate "IBM Flash Adapter 900GB Full Height PCIe Device Driver" depends on PCI + select CRC32 help Device driver for IBM's high speed PCIe SSD storage device: Flash Adapter 900GB Full Height. From 19cd3403cb0d522dd5e10188eef85817de29e26e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sun, 3 Jan 2021 22:43:09 +0100 Subject: [PATCH 284/547] lightnvm: select CONFIG_CRC32 Without CRC32 support, this fails to link: arm-linux-gnueabi-ld: drivers/lightnvm/pblk-init.o: in function `pblk_init': pblk-init.c:(.text+0x2654): undefined reference to `crc32_le' arm-linux-gnueabi-ld: drivers/lightnvm/pblk-init.o: in function `pblk_exit': pblk-init.c:(.text+0x2a7c): undefined reference to `crc32_le' Fixes: a4bd217b4326 ("lightnvm: physical block device (pblk) target") Signed-off-by: Arnd Bergmann Signed-off-by: Jens Axboe --- drivers/lightnvm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig index 8f39f9ba5c80..4c2ce210c123 100644 --- a/drivers/lightnvm/Kconfig +++ b/drivers/lightnvm/Kconfig @@ -19,6 +19,7 @@ if NVM config NVM_PBLK tristate "Physical Block Device Open-Channel SSD target" + select CRC32 help Allows an open-channel SSD to be exposed as a block device to the host. The target assumes the device exposes raw flash and must be From e71ba9452f0b5b2e8dc8aa5445198cd9214a6a62 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 3 Jan 2021 15:55:30 -0800 Subject: [PATCH 285/547] Linux 5.11-rc2 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 3d328b7ab200..8b2c3f88ee5e 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 11 SUBLEVEL = 0 -EXTRAVERSION = -rc1 +EXTRAVERSION = -rc2 NAME = Kleptomaniac Octopus # *DOCUMENTATION* From 4f8b848788f77c7f5c3bd98febce66b7aa14785f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sun, 3 Jan 2021 22:43:52 +0100 Subject: [PATCH 286/547] zonefs: select CONFIG_CRC32 When CRC32 is disabled, zonefs cannot be linked: ld: fs/zonefs/super.o: in function `zonefs_fill_super': Add a Kconfig 'select' statement for it. Fixes: 8dcc1a9d90c1 ("fs: New zonefs file system") Signed-off-by: Arnd Bergmann Signed-off-by: Damien Le Moal --- fs/zonefs/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/zonefs/Kconfig b/fs/zonefs/Kconfig index ef2697b78820..827278f937fe 100644 --- a/fs/zonefs/Kconfig +++ b/fs/zonefs/Kconfig @@ -3,6 +3,7 @@ config ZONEFS_FS depends on BLOCK depends on BLK_DEV_ZONED select FS_IOMAP + select CRC32 help zonefs is a simple file system which exposes zones of a zoned block device (e.g. host-managed or host-aware SMR disk drives) as files. From 5136bb8c8b5872676f397b27f93a30568baf3a25 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Sat, 19 Dec 2020 17:24:56 +0100 Subject: [PATCH 287/547] MAINTAINERS: adjust GCC PLUGINS after gcc-plugin.sh removal Commit 1e860048c53e ("gcc-plugins: simplify GCC plugin-dev capability test") removed ./scripts/gcc-plugin.sh, but missed to adjust MAINTAINERS. Hence, ./scripts/get_maintainers.pl --self-test=patterns warns: warning: no file matches F: scripts/gcc-plugin.sh Adjust entries in GGC PLUGINS section after this file removal. Signed-off-by: Lukas Bulwahn Signed-off-by: Masahiro Yamada --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 6eff4f720c72..181c32c18aaf 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7363,7 +7363,6 @@ L: linux-hardening@vger.kernel.org S: Maintained F: Documentation/kbuild/gcc-plugins.rst F: scripts/Makefile.gcc-plugins -F: scripts/gcc-plugin.sh F: scripts/gcc-plugins/ GCOV BASED KERNEL PROFILING From d39648eb67ac851c7918c794424c266a5d2635b9 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 19 Dec 2020 09:08:05 -0800 Subject: [PATCH 288/547] kconfig: config script: add a little user help Give the user a clue about the problem along with the 35 lines of usage/help text. Signed-off-by: Randy Dunlap Cc: Andi Kleen Cc: Masahiro Yamada Cc: linux-kbuild@vger.kernel.org Signed-off-by: Masahiro Yamada --- scripts/config | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/config b/scripts/config index 8c8d7c3d7acc..ff88e2faefd3 100755 --- a/scripts/config +++ b/scripts/config @@ -223,6 +223,7 @@ while [ "$1" != "" ] ; do ;; *) + echo "bad command: $CMD" >&2 usage ;; esac From c0f975af1745391749e4306aa8081b9a4d2cced8 Mon Sep 17 00:00:00 2001 From: John Millikin Date: Wed, 23 Dec 2020 14:04:23 +0900 Subject: [PATCH 289/547] kconfig: Support building mconf with vendor sysroot ncurses Changes the final fallback path in the ncurses locator for mconf to support host compilers with a non-default sysroot. This is similar to the hardcoded search for ncurses under '/usr/include', but can support compilers that keep their default header and library directories elsewhere. For nconfig, do nothing because the only vendor compiler I'm aware of with this layout (Apple Clang) ships an ncurses version that's too old for nconfig anyway. Signed-off-by: John Millikin Signed-off-by: Masahiro Yamada --- scripts/kconfig/mconf-cfg.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/kconfig/mconf-cfg.sh b/scripts/kconfig/mconf-cfg.sh index aa68ec95620d..fcd4acd4e9cb 100755 --- a/scripts/kconfig/mconf-cfg.sh +++ b/scripts/kconfig/mconf-cfg.sh @@ -33,7 +33,9 @@ if [ -f /usr/include/ncurses/ncurses.h ]; then exit 0 fi -if [ -f /usr/include/ncurses.h ]; then +# As a final fallback before giving up, check if $HOSTCC knows of a default +# ncurses installation (e.g. from a vendor-specific sysroot). +if echo '#include ' | "${HOSTCC}" -E - >/dev/null 2>&1; then echo cflags=\"-D_GNU_SOURCE\" echo libs=\"-lncurses\" exit 0 From 0c36d88cff4d72149f94809303c5180b6f716d39 Mon Sep 17 00:00:00 2001 From: John Millikin Date: Wed, 23 Dec 2020 15:23:25 +0900 Subject: [PATCH 290/547] lib/raid6: Let $(UNROLL) rules work with macOS userland Older versions of BSD awk are fussy about the order of '-v' and '-f' flags, and require a space after the flag name. This causes build failures on platforms with an old awk, such as macOS and NetBSD. Since GNU awk and modern versions of BSD awk (distributed with FreeBSD/OpenBSD) are fine with either form, the definition of 'cmd_unroll' can be trivially tweaked to let the lib/raid6 Makefile work with both old and new awk flag dialects. Signed-off-by: John Millikin Signed-off-by: Masahiro Yamada --- lib/raid6/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile index b4c0df6d706d..c770570bfe4f 100644 --- a/lib/raid6/Makefile +++ b/lib/raid6/Makefile @@ -48,7 +48,7 @@ endif endif quiet_cmd_unroll = UNROLL $@ - cmd_unroll = $(AWK) -f$(srctree)/$(src)/unroll.awk -vN=$* < $< > $@ + cmd_unroll = $(AWK) -v N=$* -f $(srctree)/$(src)/unroll.awk < $< > $@ targets += int1.c int2.c int4.c int8.c int16.c int32.c $(obj)/int%.c: $(src)/int.uc $(src)/unroll.awk FORCE From 9bba03d4473df0b707224d4d2067b62d1e1e2a77 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 23 Dec 2020 15:35:42 +0900 Subject: [PATCH 291/547] kconfig: remove 'kvmconfig' and 'xenconfig' shorthands Linux 5.10 is out. Remove the 'kvmconfig' and 'xenconfig' shorthands as previously announced. Signed-off-by: Masahiro Yamada --- scripts/kconfig/Makefile | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/scripts/kconfig/Makefile b/scripts/kconfig/Makefile index e46df0a2d4f9..2c40e68853dd 100644 --- a/scripts/kconfig/Makefile +++ b/scripts/kconfig/Makefile @@ -94,16 +94,6 @@ configfiles=$(wildcard $(srctree)/kernel/configs/$@ $(srctree)/arch/$(SRCARCH)/c $(Q)$(CONFIG_SHELL) $(srctree)/scripts/kconfig/merge_config.sh -m .config $(configfiles) $(Q)$(MAKE) -f $(srctree)/Makefile olddefconfig -PHONY += kvmconfig -kvmconfig: kvm_guest.config - @echo >&2 "WARNING: 'make $@' will be removed after Linux 5.10" - @echo >&2 " Please use 'make $<' instead." - -PHONY += xenconfig -xenconfig: xen.config - @echo >&2 "WARNING: 'make $@' will be removed after Linux 5.10" - @echo >&2 " Please use 'make $<' instead." - PHONY += tinyconfig tinyconfig: $(Q)$(MAKE) -f $(srctree)/Makefile allnoconfig tiny.config From d6c1ddd938d84a1adef7e19e8efc10e1b4df5034 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Wed, 30 Dec 2020 16:25:34 +0100 Subject: [PATCH 292/547] USB: serial: option: add Quectel EM160R-GL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New modem using ff/ff/30 for QCDM, ff/00/00 for AT and NMEA, and ff/ff/ff for RMNET/QMI. T: Bus=02 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 2 Spd=5000 MxCh= 0 D: Ver= 3.20 Cls=ef(misc ) Sub=02 Prot=01 MxPS= 9 #Cfgs= 1 P: Vendor=2c7c ProdID=0620 Rev= 4.09 S: Manufacturer=Quectel S: Product=EM160R-GL S: SerialNumber=e31cedc1 C:* #Ifs= 5 Cfg#= 1 Atr=a0 MxPwr=896mA I:* If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=(none) E: Ad=81(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=01(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=(none) E: Ad=83(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=82(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=(none) E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=84(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms I:* If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=(none) E: Ad=87(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=86(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=(none) E: Ad=88(I) Atr=03(Int.) MxPS= 8 Ivl=32ms E: Ad=8e(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=0f(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms Cc: stable@vger.kernel.org Signed-off-by: Bjørn Mork [ johan: add model comment ] Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 2c21e34235bb..ee726a106706 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1117,6 +1117,8 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM12, 0xff, 0xff, 0xff), .driver_info = RSVD(1) | RSVD(2) | RSVD(3) | RSVD(4) | NUMEP2 }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EM12, 0xff, 0, 0) }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, 0x0620, 0xff, 0xff, 0x30) }, /* EM160R-GL */ + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, 0x0620, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_RM500Q, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_RM500Q, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_RM500Q, 0xff, 0xff, 0x10), From 42e85f90171a4ba59a1e1cedbbc30ce3f68f2317 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 15 Dec 2020 11:30:26 +0100 Subject: [PATCH 293/547] arm64/smp: Remove unused irq variable in arch_show_interrupts() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit arch/arm64/kernel/smp.c: In function ‘arch_show_interrupts’: arch/arm64/kernel/smp.c:808:16: warning: unused variable ‘irq’ [-Wunused-variable] 808 | unsigned int irq = irq_desc_get_irq(ipi_desc[i]); | ^~~ The removal of the last user forgot to remove the variable. Fixes: 5089bc51f81f ("arm64/smp: Use irq_desc_kstat_cpu() in arch_show_interrupts()") Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20201215103026.2872532-1-geert+renesas@glider.be Signed-off-by: Catalin Marinas --- arch/arm64/kernel/smp.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 6bc3a3698c3d..376343d6f13a 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -807,7 +807,6 @@ int arch_show_interrupts(struct seq_file *p, int prec) unsigned int cpu, i; for (i = 0; i < NR_IPI; i++) { - unsigned int irq = irq_desc_get_irq(ipi_desc[i]); seq_printf(p, "%*s%u:%s", prec - 1, "IPI", i, prec >= 4 ? " " : ""); for_each_online_cpu(cpu) From b614231dec7864a338ce85032aa3d2d7ea2bc46d Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Wed, 2 Dec 2020 23:34:58 -0800 Subject: [PATCH 294/547] arm64: mte: remove an ISB on kernel exit This ISB is unnecessary because we will soon do an ERET. Signed-off-by: Peter Collingbourne Link: https://linux-review.googlesource.com/id/I69f1ee6bb09b1372dd744a0e01cedaf090c8d448 Link: https://lore.kernel.org/r/20201203073458.2675400-1-pcc@google.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/entry.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 2a93fa5f4e49..a8c3e7aaca74 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -182,7 +182,6 @@ alternative_else_nop_endif mrs_s \tmp2, SYS_GCR_EL1 bfi \tmp2, \tmp, #0, #16 msr_s SYS_GCR_EL1, \tmp2 - isb #endif .endm @@ -194,6 +193,7 @@ alternative_else_nop_endif ldr_l \tmp, gcr_kernel_excl mte_set_gcr \tmp, \tmp2 + isb 1: #endif .endm From 095507dc1350b3a2b8b39fdc05edba0c10859eca Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Fri, 18 Dec 2020 17:33:07 +0100 Subject: [PATCH 295/547] arm64: mm: Fix ARCH_LOW_ADDRESS_LIMIT when !CONFIG_ZONE_DMA Systems configured with CONFIG_ZONE_DMA32, CONFIG_ZONE_NORMAL and !CONFIG_ZONE_DMA will fail to properly setup ARCH_LOW_ADDRESS_LIMIT. The limit will default to ~0ULL, effectively spanning the whole memory, which is too high for a configuration that expects low memory to be capped at 4GB. Fix ARCH_LOW_ADDRESS_LIMIT by falling back to arm64_dma32_phys_limit when arm64_dma_phys_limit isn't set. arm64_dma32_phys_limit will honour CONFIG_ZONE_DMA32, or span the entire memory when not enabled. Fixes: 1a8e1cef7603 ("arm64: use both ZONE_DMA and ZONE_DMA32") Signed-off-by: Nicolas Saenz Julienne Link: https://lore.kernel.org/r/20201218163307.10150-1-nsaenzjulienne@suse.de Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/processor.h | 3 ++- arch/arm64/mm/init.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index ca2cd75d3286..69ad25fbeae4 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -94,7 +94,8 @@ #endif /* CONFIG_ARM64_FORCE_52BIT */ extern phys_addr_t arm64_dma_phys_limit; -#define ARCH_LOW_ADDRESS_LIMIT (arm64_dma_phys_limit - 1) +extern phys_addr_t arm64_dma32_phys_limit; +#define ARCH_LOW_ADDRESS_LIMIT ((arm64_dma_phys_limit ? : arm64_dma32_phys_limit) - 1) struct debug_info { #ifdef CONFIG_HAVE_HW_BREAKPOINT diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 75addb36354a..7deddf56f7c3 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -59,7 +59,7 @@ EXPORT_SYMBOL(memstart_addr); * bit addressable memory area. */ phys_addr_t arm64_dma_phys_limit __ro_after_init; -static phys_addr_t arm64_dma32_phys_limit __ro_after_init; +phys_addr_t arm64_dma32_phys_limit __ro_after_init; #ifdef CONFIG_KEXEC_CORE /* From 26982a89cad77c0efc1c0c79bee0e3d75e9281d4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 21 Dec 2020 22:37:58 +0000 Subject: [PATCH 296/547] afs: Work around strnlen() oops with CONFIG_FORTIFIED_SOURCE=y AFS has a structured layout in its directory contents (AFS dirs are downloaded as files and parsed locally by the client for lookup/readdir). The slots in the directory are defined by union afs_xdr_dirent. This, however, only directly allows a name of a length that will fit into that union. To support a longer name, the next 1-8 contiguous entries are annexed to the first one and the name flows across these. afs_dir_iterate_block() uses strnlen(), limited to the space to the end of the page, to find out how long the name is. This worked fine until 6a39e62abbaf. With that commit, the compiler determines the size of the array and asserts that the string fits inside that array. This is a problem for AFS because we *expect* it to overflow one or more arrays. A similar problem also occurs in afs_dir_scan_block() when a directory file is being locally edited to avoid the need to redownload it. There strlen() was being used safely because each page has the last byte set to 0 when the file is downloaded and validated (in afs_dir_check_page()). Fix this by changing the afs_xdr_dirent union name field to an indeterminate-length array and dropping the overflow field. (Note that whilst looking at this, I realised that the calculation of the number of slots a dirent used is non-standard and not quite right, but I'll address that in a separate patch.) The issue can be triggered by something like: touch /afs/example.com/thisisaveryveryverylongname and it generates a report that looks like: detected buffer overflow in strnlen ------------[ cut here ]------------ kernel BUG at lib/string.c:1149! ... RIP: 0010:fortify_panic+0xf/0x11 ... Call Trace: afs_dir_iterate_block+0x12b/0x35b afs_dir_iterate+0x14e/0x1ce afs_do_lookup+0x131/0x417 afs_lookup+0x24f/0x344 lookup_open.isra.0+0x1bb/0x27d open_last_lookups+0x166/0x237 path_openat+0xe0/0x159 do_filp_open+0x48/0xa4 ? kmem_cache_alloc+0xf5/0x16e ? __clear_close_on_exec+0x13/0x22 ? _raw_spin_unlock+0xa/0xb do_sys_openat2+0x72/0xde do_sys_open+0x3b/0x58 do_syscall_64+0x2d/0x3a entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: 6a39e62abbaf ("lib: string.h: detect intra-object overflow in fortified string functions") Reported-by: Marc Dionne Signed-off-by: David Howells Tested-by: Marc Dionne cc: Daniel Axtens --- fs/afs/xdr_fs.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/afs/xdr_fs.h b/fs/afs/xdr_fs.h index 94f1f398eefa..c926430fd08a 100644 --- a/fs/afs/xdr_fs.h +++ b/fs/afs/xdr_fs.h @@ -54,10 +54,15 @@ union afs_xdr_dirent { __be16 hash_next; __be32 vnode; __be32 unique; - u8 name[16]; - u8 overflow[4]; /* if any char of the name (inc - * NUL) reaches here, consume - * the next dirent too */ + u8 name[]; + /* When determining the number of dirent slots needed to + * represent a directory entry, name should be assumed to be 16 + * bytes, due to a now-standardised (mis)calculation, but it is + * in fact 20 bytes in size. + * + * For names longer than (16 or) 20 bytes, extra slots should + * be annexed to this one using the extended_name format. + */ } u; u8 extended_name[32]; } __packed; From 366911cd762db02c2dd32fad1be96b72a66f205d Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 23 Dec 2020 10:39:57 +0000 Subject: [PATCH 297/547] afs: Fix directory entry size calculation The number of dirent records used by an AFS directory entry should be calculated using the assumption that there is a 16-byte name field in the first block, rather than a 20-byte name field (which is actually the case). This miscalculation is historic and effectively standard, so we have to use it. The calculation we need to use is: 1 + (((strlen(name) + 1) + 15) >> 5) where we are adding one to the strlen() result to account for the NUL termination. Fix this by the following means: (1) Create an inline function to do the calculation for a given name length. (2) Use the function to calculate the number of records used for a dirent in afs_dir_iterate_block(). Use this to move the over-end check out of the loop since it only needs to be done once. Further use this to only go through the loop for the 2nd+ records composing an entry. The only test there now is for if the record is allocated - and we already checked the first block at the top of the outer loop. (3) Add a max name length check in afs_dir_iterate_block(). (4) Make afs_edit_dir_add() and afs_edit_dir_remove() use the function from (1) to calculate the number of blocks rather than doing it incorrectly themselves. Fixes: 63a4681ff39c ("afs: Locally edit directory data for mkdir/create/unlink/...") Fixes: ^1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: David Howells Tested-by: Marc Dionne --- fs/afs/dir.c | 49 ++++++++++++++++++++------------------ fs/afs/dir_edit.c | 6 ++--- fs/afs/xdr_fs.h | 14 ++++++++++- include/trace/events/afs.h | 2 ++ 4 files changed, 43 insertions(+), 28 deletions(-) diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 9068d5578a26..7bd659ad959e 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -350,7 +350,7 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode, unsigned blkoff) { union afs_xdr_dirent *dire; - unsigned offset, next, curr; + unsigned offset, next, curr, nr_slots; size_t nlen; int tmp; @@ -363,13 +363,12 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode, offset < AFS_DIR_SLOTS_PER_BLOCK; offset = next ) { - next = offset + 1; - /* skip entries marked unused in the bitmap */ if (!(block->hdr.bitmap[offset / 8] & (1 << (offset % 8)))) { _debug("ENT[%zu.%u]: unused", blkoff / sizeof(union afs_xdr_dir_block), offset); + next = offset + 1; if (offset >= curr) ctx->pos = blkoff + next * sizeof(union afs_xdr_dirent); @@ -381,35 +380,39 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode, nlen = strnlen(dire->u.name, sizeof(*block) - offset * sizeof(union afs_xdr_dirent)); + if (nlen > AFSNAMEMAX - 1) { + _debug("ENT[%zu]: name too long (len %u/%zu)", + blkoff / sizeof(union afs_xdr_dir_block), + offset, nlen); + return afs_bad(dvnode, afs_file_error_dir_name_too_long); + } _debug("ENT[%zu.%u]: %s %zu \"%s\"", blkoff / sizeof(union afs_xdr_dir_block), offset, (offset < curr ? "skip" : "fill"), nlen, dire->u.name); - /* work out where the next possible entry is */ - for (tmp = nlen; tmp > 15; tmp -= sizeof(union afs_xdr_dirent)) { - if (next >= AFS_DIR_SLOTS_PER_BLOCK) { - _debug("ENT[%zu.%u]:" - " %u travelled beyond end dir block" - " (len %u/%zu)", + nr_slots = afs_dir_calc_slots(nlen); + next = offset + nr_slots; + if (next > AFS_DIR_SLOTS_PER_BLOCK) { + _debug("ENT[%zu.%u]:" + " %u extends beyond end dir block" + " (len %zu)", + blkoff / sizeof(union afs_xdr_dir_block), + offset, next, nlen); + return afs_bad(dvnode, afs_file_error_dir_over_end); + } + + /* Check that the name-extension dirents are all allocated */ + for (tmp = 1; tmp < nr_slots; tmp++) { + unsigned int ix = offset + tmp; + if (!(block->hdr.bitmap[ix / 8] & (1 << (ix % 8)))) { + _debug("ENT[%zu.u]:" + " %u unmarked extension (%u/%u)", blkoff / sizeof(union afs_xdr_dir_block), - offset, next, tmp, nlen); - return afs_bad(dvnode, afs_file_error_dir_over_end); - } - if (!(block->hdr.bitmap[next / 8] & - (1 << (next % 8)))) { - _debug("ENT[%zu.%u]:" - " %u unmarked extension (len %u/%zu)", - blkoff / sizeof(union afs_xdr_dir_block), - offset, next, tmp, nlen); + offset, tmp, nr_slots); return afs_bad(dvnode, afs_file_error_dir_unmarked_ext); } - - _debug("ENT[%zu.%u]: ext %u/%zu", - blkoff / sizeof(union afs_xdr_dir_block), - next, tmp, nlen); - next++; } /* skip if starts before the current position */ diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c index 2ffe09abae7f..f4600c1353ad 100644 --- a/fs/afs/dir_edit.c +++ b/fs/afs/dir_edit.c @@ -215,8 +215,7 @@ void afs_edit_dir_add(struct afs_vnode *vnode, } /* Work out how many slots we're going to need. */ - need_slots = round_up(12 + name->len + 1 + 4, AFS_DIR_DIRENT_SIZE); - need_slots /= AFS_DIR_DIRENT_SIZE; + need_slots = afs_dir_calc_slots(name->len); meta_page = kmap(page0); meta = &meta_page->blocks[0]; @@ -393,8 +392,7 @@ void afs_edit_dir_remove(struct afs_vnode *vnode, } /* Work out how many slots we're going to discard. */ - need_slots = round_up(12 + name->len + 1 + 4, AFS_DIR_DIRENT_SIZE); - need_slots /= AFS_DIR_DIRENT_SIZE; + need_slots = afs_dir_calc_slots(name->len); meta_page = kmap(page0); meta = &meta_page->blocks[0]; diff --git a/fs/afs/xdr_fs.h b/fs/afs/xdr_fs.h index c926430fd08a..8ca868164507 100644 --- a/fs/afs/xdr_fs.h +++ b/fs/afs/xdr_fs.h @@ -58,7 +58,8 @@ union afs_xdr_dirent { /* When determining the number of dirent slots needed to * represent a directory entry, name should be assumed to be 16 * bytes, due to a now-standardised (mis)calculation, but it is - * in fact 20 bytes in size. + * in fact 20 bytes in size. afs_dir_calc_slots() should be + * used for this. * * For names longer than (16 or) 20 bytes, extra slots should * be annexed to this one using the extended_name format. @@ -101,4 +102,15 @@ struct afs_xdr_dir_page { union afs_xdr_dir_block blocks[AFS_DIR_BLOCKS_PER_PAGE]; }; +/* + * Calculate the number of dirent slots required for any given name length. + * The calculation is made assuming the part of the name in the first slot is + * 16 bytes, rather than 20, but this miscalculation is now standardised. + */ +static inline unsigned int afs_dir_calc_slots(size_t name_len) +{ + name_len++; /* NUL-terminated */ + return 1 + ((name_len + 15) / AFS_DIR_DIRENT_SIZE); +} + #endif /* XDR_FS_H */ diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index 4eef374d4413..4a5cc8c64be3 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -231,6 +231,7 @@ enum afs_file_error { afs_file_error_dir_bad_magic, afs_file_error_dir_big, afs_file_error_dir_missing_page, + afs_file_error_dir_name_too_long, afs_file_error_dir_over_end, afs_file_error_dir_small, afs_file_error_dir_unmarked_ext, @@ -488,6 +489,7 @@ enum afs_cb_break_reason { EM(afs_file_error_dir_bad_magic, "DIR_BAD_MAGIC") \ EM(afs_file_error_dir_big, "DIR_BIG") \ EM(afs_file_error_dir_missing_page, "DIR_MISSING_PAGE") \ + EM(afs_file_error_dir_name_too_long, "DIR_NAME_TOO_LONG") \ EM(afs_file_error_dir_over_end, "DIR_ENT_OVER_END") \ EM(afs_file_error_dir_small, "DIR_SMALL") \ EM(afs_file_error_dir_unmarked_ext, "DIR_UNMARKED_EXT") \ From 0bd1bf86ab79555425b9f0b63005e181defe4da6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sun, 3 Jan 2021 14:57:29 +0100 Subject: [PATCH 298/547] dmaengine: qcom: fix gpi undefined behavior gcc points out an incorrect error handling loop: drivers/dma/qcom/gpi.c: In function 'gpi_ch_init': drivers/dma/qcom/gpi.c:1254:15: error: iteration 2 invokes undefined behavior [-Werror=aggressive-loop-optimizations] 1254 | struct gpii *gpii = gchan->gpii; | ^~~~ drivers/dma/qcom/gpi.c:1951:2: note: within this loop 1951 | for (i = i - 1; i >= 0; i++) { | ^~~ Change the loop to correctly walk backwards through the initialized fields rather than off into the woods. Fixes: 5d0c3533a19f ("dmaengine: qcom: Add GPI dma driver") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20210103135738.3741123-1-arnd@kernel.org Signed-off-by: Vinod Koul --- drivers/dma/qcom/gpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/qcom/gpi.c b/drivers/dma/qcom/gpi.c index 556c070a514c..1a0bf6b0567a 100644 --- a/drivers/dma/qcom/gpi.c +++ b/drivers/dma/qcom/gpi.c @@ -1948,7 +1948,7 @@ static int gpi_ch_init(struct gchan *gchan) return ret; error_start_chan: - for (i = i - 1; i >= 0; i++) { + for (i = i - 1; i >= 0; i--) { gpi_stop_chan(&gpii->gchan[i]); gpi_send_cmd(gpii, gchan, GPI_CH_CMD_RESET); } From 99974aedbd73523969afb09f33c6e3047cd0ddae Mon Sep 17 00:00:00 2001 From: Shravya Kumbham Date: Wed, 23 Dec 2020 16:51:00 +0530 Subject: [PATCH 299/547] dmaengine: xilinx_dma: check dma_async_device_register return value dma_async_device_register() can return non-zero error code. Add condition to check the return value of dma_async_device_register function and handle the error path. Addresses-Coverity: Event check_return. Fixes: 9cd4360de609 ("dma: Add Xilinx AXI Video Direct Memory Access Engine driver support") Signed-off-by: Shravya Kumbham Signed-off-by: Radhey Shyam Pandey Link: https://lore.kernel.org/r/1608722462-29519-2-git-send-email-radhey.shyam.pandey@xilinx.com Signed-off-by: Vinod Koul --- drivers/dma/xilinx/xilinx_dma.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/dma/xilinx/xilinx_dma.c b/drivers/dma/xilinx/xilinx_dma.c index 22faea653ea8..091f1f80dcff 100644 --- a/drivers/dma/xilinx/xilinx_dma.c +++ b/drivers/dma/xilinx/xilinx_dma.c @@ -3112,7 +3112,11 @@ static int xilinx_dma_probe(struct platform_device *pdev) } /* Register the DMA engine with the core */ - dma_async_device_register(&xdev->common); + err = dma_async_device_register(&xdev->common); + if (err) { + dev_err(xdev->dev, "failed to register the dma device\n"); + goto error; + } err = of_dma_controller_register(node, of_dma_xilinx_xlate, xdev); From faeb0731be0a31e2246b21a85fa7dabbd750101d Mon Sep 17 00:00:00 2001 From: Shravya Kumbham Date: Wed, 23 Dec 2020 16:51:01 +0530 Subject: [PATCH 300/547] dmaengine: xilinx_dma: fix incompatible param warning in _child_probe() In xilinx_dma_child_probe function, the nr_channels variable is passed to of_property_read_u32() which expects an u32 return value pointer. Modify the nr_channels variable type from int to u32 to fix the incompatible parameter coverity warning. Addresses-Coverity: Event incompatible_param. Fixes: 1a9e7a03c761 ("dmaengine: vdma: Add support for mulit-channel dma mode") Signed-off-by: Shravya Kumbham Signed-off-by: Radhey Shyam Pandey Link: https://lore.kernel.org/r/1608722462-29519-3-git-send-email-radhey.shyam.pandey@xilinx.com Signed-off-by: Vinod Koul --- drivers/dma/xilinx/xilinx_dma.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/dma/xilinx/xilinx_dma.c b/drivers/dma/xilinx/xilinx_dma.c index 091f1f80dcff..3e374e8d2244 100644 --- a/drivers/dma/xilinx/xilinx_dma.c +++ b/drivers/dma/xilinx/xilinx_dma.c @@ -2900,7 +2900,8 @@ static int xilinx_dma_chan_probe(struct xilinx_dma_device *xdev, static int xilinx_dma_child_probe(struct xilinx_dma_device *xdev, struct device_node *node) { - int ret, i, nr_channels = 1; + int ret, i; + u32 nr_channels = 1; ret = of_property_read_u32(node, "dma-channels", &nr_channels); if (xdev->dma_config->dmatype == XDMA_TYPE_AXIMCDMA && ret < 0) From 2d5efea64472469117dc1a9a39530069e95b21e9 Mon Sep 17 00:00:00 2001 From: Shravya Kumbham Date: Wed, 23 Dec 2020 16:51:02 +0530 Subject: [PATCH 301/547] dmaengine: xilinx_dma: fix mixed_enum_type coverity warning Typecast the fls(width -1) with (enum dmaengine_alignment) in xilinx_dma_chan_probe function to fix the coverity warning. Addresses-Coverity: Event mixed_enum_type. Fixes: 9cd4360de609 ("dma: Add Xilinx AXI Video Direct Memory Access Engine driver support") Signed-off-by: Shravya Kumbham Signed-off-by: Radhey Shyam Pandey Link: https://lore.kernel.org/r/1608722462-29519-4-git-send-email-radhey.shyam.pandey@xilinx.com Signed-off-by: Vinod Koul --- drivers/dma/xilinx/xilinx_dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/xilinx/xilinx_dma.c b/drivers/dma/xilinx/xilinx_dma.c index 3e374e8d2244..79777550a6ff 100644 --- a/drivers/dma/xilinx/xilinx_dma.c +++ b/drivers/dma/xilinx/xilinx_dma.c @@ -2781,7 +2781,7 @@ static int xilinx_dma_chan_probe(struct xilinx_dma_device *xdev, has_dre = false; if (!has_dre) - xdev->common.copy_align = fls(width - 1); + xdev->common.copy_align = (enum dmaengine_alignment)fls(width - 1); if (of_device_is_compatible(node, "xlnx,axi-vdma-mm2s-channel") || of_device_is_compatible(node, "xlnx,axi-dma-mm2s-channel") || From 98bf2d3f4970179c702ef64db658e0553bc6ef3a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 22 Dec 2020 07:11:18 +0000 Subject: [PATCH 302/547] powerpc/32s: Fix RTAS machine check with VMAP stack When we have VMAP stack, exception prolog 1 sets r1, not r11. When it is not an RTAS machine check, don't trash r1 because it is needed by prolog 1. Fixes: da7bb43ab9da ("powerpc/32: Fix vmap stack - Properly set r1 before activating MMU") Fixes: d2e006036082 ("powerpc/32: Use SPRN_SPRG_SCRATCH2 in exception prologs") Cc: stable@vger.kernel.org # v5.10+ Signed-off-by: Christophe Leroy [mpe: Squash in fixup for RTAS machine check from Christophe] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/bc77d61d1c18940e456a2dee464f1e2eda65a3f0.1608621048.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/head_book3s_32.S | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/powerpc/kernel/head_book3s_32.S b/arch/powerpc/kernel/head_book3s_32.S index 349bf3f0c3af..858fbc8b19f3 100644 --- a/arch/powerpc/kernel/head_book3s_32.S +++ b/arch/powerpc/kernel/head_book3s_32.S @@ -260,10 +260,19 @@ __secondary_hold_acknowledge: MachineCheck: EXCEPTION_PROLOG_0 #ifdef CONFIG_PPC_CHRP +#ifdef CONFIG_VMAP_STACK + mtspr SPRN_SPRG_SCRATCH2,r1 + mfspr r1, SPRN_SPRG_THREAD + lwz r1, RTAS_SP(r1) + cmpwi cr1, r1, 0 + bne cr1, 7f + mfspr r1, SPRN_SPRG_SCRATCH2 +#else mfspr r11, SPRN_SPRG_THREAD lwz r11, RTAS_SP(r11) cmpwi cr1, r11, 0 bne cr1, 7f +#endif #endif /* CONFIG_PPC_CHRP */ EXCEPTION_PROLOG_1 for_rtas=1 7: EXCEPTION_PROLOG_2 From 6170d077bf92c5b3dfbe1021688d3c0404f7c9e9 Mon Sep 17 00:00:00 2001 From: Xu Yilun Date: Mon, 4 Jan 2021 09:29:09 +0800 Subject: [PATCH 303/547] spi: fix the divide by 0 error when calculating xfer waiting time The xfer waiting time is the result of xfer->len / xfer->speed_hz. This patch makes the assumption of 100khz xfer speed if the xfer->speed_hz is not assigned and stays 0. This avoids the divide by 0 issue and ensures a reasonable tolerant waiting time. Signed-off-by: Xu Yilun Link: https://lore.kernel.org/r/1609723749-3557-1-git-send-email-yilun.xu@intel.com Signed-off-by: Mark Brown --- drivers/spi/spi.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index f59bf5094adb..720ab34784c1 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -1108,6 +1108,7 @@ static int spi_transfer_wait(struct spi_controller *ctlr, { struct spi_statistics *statm = &ctlr->statistics; struct spi_statistics *stats = &msg->spi->statistics; + u32 speed_hz = xfer->speed_hz; unsigned long long ms; if (spi_controller_is_slave(ctlr)) { @@ -1116,8 +1117,11 @@ static int spi_transfer_wait(struct spi_controller *ctlr, return -EINTR; } } else { + if (!speed_hz) + speed_hz = 100000; + ms = 8LL * 1000LL * xfer->len; - do_div(ms, xfer->speed_hz); + do_div(ms, speed_hz); ms += ms + 200; /* some tolerance */ if (ms > UINT_MAX) From 2bf3a72b08e7f6356a2db9e1571ca65f683510bb Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Thu, 31 Dec 2020 15:23:45 +0300 Subject: [PATCH 304/547] dt-bindings: regulator: qcom,rpmh-regulator: add pm8009 revision PMIC pm8009 has special revision (P=1) made for sm8250 platform. The major difference is the S2 regulator which supplies 0.95 V instead of 2.848V. Add special compatibility string for this chip revision. The datasheet calls the chip just pm8009-1, so use the same name. Signed-off-by: Dmitry Baryshkov Reviewed-by: Vinod Koul Link: https://lore.kernel.org/r/20201231122348.637917-2-dmitry.baryshkov@linaro.org Signed-off-by: Mark Brown --- .../devicetree/bindings/regulator/qcom,rpmh-regulator.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/regulator/qcom,rpmh-regulator.txt b/Documentation/devicetree/bindings/regulator/qcom,rpmh-regulator.txt index b8f0b7809c02..7d462b899473 100644 --- a/Documentation/devicetree/bindings/regulator/qcom,rpmh-regulator.txt +++ b/Documentation/devicetree/bindings/regulator/qcom,rpmh-regulator.txt @@ -44,6 +44,7 @@ First Level Nodes - PMIC Definition: Must be one of below: "qcom,pm8005-rpmh-regulators" "qcom,pm8009-rpmh-regulators" + "qcom,pm8009-1-rpmh-regulators" "qcom,pm8150-rpmh-regulators" "qcom,pm8150l-rpmh-regulators" "qcom,pm8350-rpmh-regulators" From df6b92fa40050e59ea89784294bf6d04c0c47705 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Thu, 31 Dec 2020 15:23:46 +0300 Subject: [PATCH 305/547] regulator: qcom-rpmh-regulator: correct hfsmps515 definition According to the datasheet pm8009's HFS515 regulators have 16mV resolution rather than declared 1.6 mV. Correct the resolution. Signed-off-by: Dmitry Baryshkov Fixes: 06369bcc15a1 ("regulator: qcom-rpmh: Add support for SM8150") Reviewed-by: Vinod Koul Link: https://lore.kernel.org/r/20201231122348.637917-3-dmitry.baryshkov@linaro.org Signed-off-by: Mark Brown --- drivers/regulator/qcom-rpmh-regulator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/regulator/qcom-rpmh-regulator.c b/drivers/regulator/qcom-rpmh-regulator.c index fe030ec4b7db..c395a8dda6f7 100644 --- a/drivers/regulator/qcom-rpmh-regulator.c +++ b/drivers/regulator/qcom-rpmh-regulator.c @@ -726,7 +726,7 @@ static const struct rpmh_vreg_hw_data pmic5_ftsmps510 = { static const struct rpmh_vreg_hw_data pmic5_hfsmps515 = { .regulator_type = VRM, .ops = &rpmh_regulator_vrm_ops, - .voltage_range = REGULATOR_LINEAR_RANGE(2800000, 0, 4, 1600), + .voltage_range = REGULATOR_LINEAR_RANGE(2800000, 0, 4, 16000), .n_voltages = 5, .pmic_mode_map = pmic_mode_map_pmic5_smps, .of_map_mode = rpmh_regulator_pmic4_smps_of_map_mode, From d957d1610c661e758426654de3b04bea6fb29f8b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 30 Dec 2020 15:56:23 +0100 Subject: [PATCH 306/547] regulator: qcom-rpmh: add QCOM_COMMAND_DB dependency A built-in regulator driver cannot link against a modular cmd_db driver: qcom-rpmh-regulator.c:(.text+0x174): undefined reference to `cmd_db_read_addr' There is already a dependency for RPMh, so add another one of this type for cmd_db. Fixes: 34c5aa2666db ("regulator: Kconfig: Fix REGULATOR_QCOM_RPMH dependencies to avoid build error") Fixes: 46fc033eba42 ("regulator: add QCOM RPMh regulator driver") Signed-off-by: Arnd Bergmann Reviewed-by: Bjorn Andersson Link: https://lore.kernel.org/r/20201230145712.3133110-1-arnd@kernel.org Signed-off-by: Mark Brown --- drivers/regulator/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index 53fa84f4d1e1..5abdd29fb9f3 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -881,6 +881,7 @@ config REGULATOR_QCOM_RPM config REGULATOR_QCOM_RPMH tristate "Qualcomm Technologies, Inc. RPMh regulator driver" depends on QCOM_RPMH || (QCOM_RPMH=n && COMPILE_TEST) + depends on QCOM_COMMAND_DB || (QCOM_COMMAND_DB=n && COMPILE_TEST) help This driver supports control of PMIC regulators via the RPMh hardware block found on Qualcomm Technologies Inc. SoCs. RPMh regulator From 4b1a60a1bb8f03d82c3f6da424adc96667b59f2a Mon Sep 17 00:00:00 2001 From: Georgi Djakov Date: Mon, 4 Jan 2021 15:50:43 +0200 Subject: [PATCH 307/547] MAINTAINERS: Update Georgi's email address Use my kernel.org email as main address to make things a bit easier for me to handle. Link: https://lore.kernel.org/r/20210104135043.31262-1-georgi.djakov@linaro.org Signed-off-by: Georgi Djakov --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 546aa66428c9..a15e306123ef 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9240,7 +9240,7 @@ F: tools/testing/selftests/sgx/* K: \bSGX_ INTERCONNECT API -M: Georgi Djakov +M: Georgi Djakov L: linux-pm@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/interconnect/ From 0e2d6795e8dbe91c2f5473564c6b25d11df3778b Mon Sep 17 00:00:00 2001 From: Daniel Palmer Date: Sun, 27 Dec 2020 12:17:16 +0900 Subject: [PATCH 308/547] USB: serial: option: add LongSung M5710 module support Add a device-id entry for the LongSung M5710 module. T: Bus=01 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 2 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=ef(misc ) Sub=02 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=2df3 ProdID=9d03 Rev= 1.00 S: Manufacturer=Marvell S: Product=Mobile Composite Device Bus S: SerialNumber= C:* #Ifs= 5 Cfg#= 1 Atr=c0 MxPwr=500mA A: FirstIf#= 0 IfCount= 2 Cls=e0(wlcon) Sub=01 Prot=03 I:* If#= 0 Alt= 0 #EPs= 1 Cls=e0(wlcon) Sub=01 Prot=03 Driver=rndis_host E: Ad=87(I) Atr=03(Int.) MxPS= 64 Ivl=4096ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=0a(data ) Sub=00 Prot=00 Driver=rndis_host E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=0c(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=0b(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=89(I) Atr=03(Int.) MxPS= 64 Ivl=4096ms E: Ad=86(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=0f(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 5 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=88(I) Atr=03(Int.) MxPS= 64 Ivl=4096ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=0a(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms Signed-off-by: Daniel Palmer https://lore.kernel.org/r/20201227031716.1343300-1-daniel@0x0f.com [ johan: drop id defines, only bind to vendor class ] Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index ee726a106706..3fe959104311 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -2059,6 +2059,7 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x0105, 0xff), /* Fibocom NL678 series */ .driver_info = RSVD(6) }, { USB_DEVICE_INTERFACE_CLASS(0x2cb7, 0x01a0, 0xff) }, /* Fibocom NL668-AM/NL652-EU (laptop MBIM) */ + { USB_DEVICE_INTERFACE_CLASS(0x2df3, 0x9d03, 0xff) }, /* LongSung M5710 */ { USB_DEVICE_INTERFACE_CLASS(0x305a, 0x1404, 0xff) }, /* GosunCn GM500 RNDIS */ { USB_DEVICE_INTERFACE_CLASS(0x305a, 0x1405, 0xff) }, /* GosunCn GM500 MBIM */ { USB_DEVICE_INTERFACE_CLASS(0x305a, 0x1406, 0xff) }, /* GosunCn GM500 ECM/NCM */ From 54d0a3ab80f49f19ee916def62fe067596833403 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 4 Jan 2021 15:50:07 +0100 Subject: [PATCH 309/547] USB: serial: iuu_phoenix: fix DMA from stack Stack-allocated buffers cannot be used for DMA (on all architectures) so allocate the flush command buffer using kmalloc(). Fixes: 60a8fc017103 ("USB: add iuu_phoenix driver") Cc: stable # 2.6.25 Reviewed-by: Greg Kroah-Hartman Signed-off-by: Johan Hovold --- drivers/usb/serial/iuu_phoenix.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/drivers/usb/serial/iuu_phoenix.c b/drivers/usb/serial/iuu_phoenix.c index f1201d4de297..e8f06b41a503 100644 --- a/drivers/usb/serial/iuu_phoenix.c +++ b/drivers/usb/serial/iuu_phoenix.c @@ -532,23 +532,29 @@ static int iuu_uart_flush(struct usb_serial_port *port) struct device *dev = &port->dev; int i; int status; - u8 rxcmd = IUU_UART_RX; + u8 *rxcmd; struct iuu_private *priv = usb_get_serial_port_data(port); if (iuu_led(port, 0xF000, 0, 0, 0xFF) < 0) return -EIO; + rxcmd = kmalloc(1, GFP_KERNEL); + if (!rxcmd) + return -ENOMEM; + + rxcmd[0] = IUU_UART_RX; + for (i = 0; i < 2; i++) { - status = bulk_immediate(port, &rxcmd, 1); + status = bulk_immediate(port, rxcmd, 1); if (status != IUU_OPERATION_OK) { dev_dbg(dev, "%s - uart_flush_write error\n", __func__); - return status; + goto out_free; } status = read_immediate(port, &priv->len, 1); if (status != IUU_OPERATION_OK) { dev_dbg(dev, "%s - uart_flush_read error\n", __func__); - return status; + goto out_free; } if (priv->len > 0) { @@ -556,12 +562,16 @@ static int iuu_uart_flush(struct usb_serial_port *port) status = read_immediate(port, priv->buf, priv->len); if (status != IUU_OPERATION_OK) { dev_dbg(dev, "%s - uart_flush_read error\n", __func__); - return status; + goto out_free; } } } dev_dbg(dev, "%s - uart_flush_read OK!\n", __func__); iuu_led(port, 0, 0xF000, 0, 0xFF); + +out_free: + kfree(rxcmd); + return status; } From 4bfd6247fa9164c8e193a55ef9c0ea3ee22f82d8 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 4 Jan 2021 16:30:46 +0100 Subject: [PATCH 310/547] ALSA: hda/via: Fix runtime PM for Clevo W35xSS Clevo W35xSS_370SS with VIA codec has had the runtime PM problem that looses the power state of some nodes after the runtime resume. This was worked around by disabling the default runtime PM via a denylist entry. Since 5.10.x made the runtime PM applied (casually) even though it's disabled in the denylist, this problem was revisited. The result was that disabling power_save_node feature suffices for the runtime PM problem. This patch implements the disablement of power_save_node feature in VIA codec for the device. It also drops the former denylist entry, too, as the runtime PM should work in the codec side properly now. Fixes: b529ef2464ad ("ALSA: hda: Add Clevo W35xSS_370SS to the power_save blacklist") Reported-by: Christian Labisch Cc: Link: https://lore.kernel.org/r/20210104153046.19993-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_intel.c | 2 -- sound/pci/hda/patch_via.c | 13 +++++++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index 6852668f1bcb..770ad25f1907 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -2220,8 +2220,6 @@ static const struct snd_pci_quirk power_save_denylist[] = { SND_PCI_QUIRK(0x1849, 0x7662, "Asrock H81M-HDS", 0), /* https://bugzilla.redhat.com/show_bug.cgi?id=1525104 */ SND_PCI_QUIRK(0x1043, 0x8733, "Asus Prime X370-Pro", 0), - /* https://bugzilla.redhat.com/show_bug.cgi?id=1581607 */ - SND_PCI_QUIRK(0x1558, 0x3501, "Clevo W35xSS_370SS", 0), /* https://bugzilla.redhat.com/show_bug.cgi?id=1525104 */ SND_PCI_QUIRK(0x1558, 0x6504, "Clevo W65_67SB", 0), /* https://bugzilla.redhat.com/show_bug.cgi?id=1525104 */ diff --git a/sound/pci/hda/patch_via.c b/sound/pci/hda/patch_via.c index 7ef8f3105cdb..0ab40a8a68fb 100644 --- a/sound/pci/hda/patch_via.c +++ b/sound/pci/hda/patch_via.c @@ -1002,6 +1002,7 @@ static const struct hda_verb vt1802_init_verbs[] = { enum { VIA_FIXUP_INTMIC_BOOST, VIA_FIXUP_ASUS_G75, + VIA_FIXUP_POWER_SAVE, }; static void via_fixup_intmic_boost(struct hda_codec *codec, @@ -1011,6 +1012,13 @@ static void via_fixup_intmic_boost(struct hda_codec *codec, override_mic_boost(codec, 0x30, 0, 2, 40); } +static void via_fixup_power_save(struct hda_codec *codec, + const struct hda_fixup *fix, int action) +{ + if (action == HDA_FIXUP_ACT_PRE_PROBE) + codec->power_save_node = 0; +} + static const struct hda_fixup via_fixups[] = { [VIA_FIXUP_INTMIC_BOOST] = { .type = HDA_FIXUP_FUNC, @@ -1025,11 +1033,16 @@ static const struct hda_fixup via_fixups[] = { { } } }, + [VIA_FIXUP_POWER_SAVE] = { + .type = HDA_FIXUP_FUNC, + .v.func = via_fixup_power_save, + }, }; static const struct snd_pci_quirk vt2002p_fixups[] = { SND_PCI_QUIRK(0x1043, 0x1487, "Asus G75", VIA_FIXUP_ASUS_G75), SND_PCI_QUIRK(0x1043, 0x8532, "Asus X202E", VIA_FIXUP_INTMIC_BOOST), + SND_PCI_QUIRK(0x1558, 0x3501, "Clevo W35xSS_370SS", VIA_FIXUP_POWER_SAVE), {} }; From 020a1f453449294926ca548d8d5ca970926e8dfd Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 4 Jan 2021 15:53:02 +0100 Subject: [PATCH 311/547] USB: usblp: fix DMA to stack Stack-allocated buffers cannot be used for DMA (on all architectures). Replace the HP-channel macro with a helper function that allocates a dedicated transfer buffer so that it can continue to be used with arguments from the stack. Note that the buffer is cleared on allocation as usblp_ctrl_msg() returns success also on short transfers (the buffer is only used for debugging). Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20210104145302.2087-1-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/usblp.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/drivers/usb/class/usblp.c b/drivers/usb/class/usblp.c index 67cbd42421be..134dc2005ce9 100644 --- a/drivers/usb/class/usblp.c +++ b/drivers/usb/class/usblp.c @@ -274,8 +274,25 @@ static int usblp_ctrl_msg(struct usblp *usblp, int request, int type, int dir, i #define usblp_reset(usblp)\ usblp_ctrl_msg(usblp, USBLP_REQ_RESET, USB_TYPE_CLASS, USB_DIR_OUT, USB_RECIP_OTHER, 0, NULL, 0) -#define usblp_hp_channel_change_request(usblp, channel, buffer) \ - usblp_ctrl_msg(usblp, USBLP_REQ_HP_CHANNEL_CHANGE_REQUEST, USB_TYPE_VENDOR, USB_DIR_IN, USB_RECIP_INTERFACE, channel, buffer, 1) +static int usblp_hp_channel_change_request(struct usblp *usblp, int channel, u8 *new_channel) +{ + u8 *buf; + int ret; + + buf = kzalloc(1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + ret = usblp_ctrl_msg(usblp, USBLP_REQ_HP_CHANNEL_CHANGE_REQUEST, + USB_TYPE_VENDOR, USB_DIR_IN, USB_RECIP_INTERFACE, + channel, buf, 1); + if (ret == 0) + *new_channel = buf[0]; + + kfree(buf); + + return ret; +} /* * See the description for usblp_select_alts() below for the usage From 718bf42b119de652ebcc93655a1f33a9c0d04b3c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 28 Dec 2020 23:13:09 -0800 Subject: [PATCH 312/547] usb: usbip: vhci_hcd: protect shift size Fix shift out-of-bounds in vhci_hcd.c: UBSAN: shift-out-of-bounds in ../drivers/usb/usbip/vhci_hcd.c:399:41 shift exponent 768 is too large for 32-bit type 'int' Fixes: 03cd00d538a6 ("usbip: vhci-hcd: Set the vhci structure up to work") Signed-off-by: Randy Dunlap Reported-by: syzbot+297d20e437b79283bf6d@syzkaller.appspotmail.com Cc: Yuyang Du Cc: Shuah Khan Cc: Greg Kroah-Hartman Cc: linux-usb@vger.kernel.org Cc: stable Link: https://lore.kernel.org/r/20201229071309.18418-1-rdunlap@infradead.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/usbip/vhci_hcd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/usbip/vhci_hcd.c b/drivers/usb/usbip/vhci_hcd.c index 66cde5e5f796..3209b5ddd30c 100644 --- a/drivers/usb/usbip/vhci_hcd.c +++ b/drivers/usb/usbip/vhci_hcd.c @@ -396,6 +396,8 @@ static int vhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, default: usbip_dbg_vhci_rh(" ClearPortFeature: default %x\n", wValue); + if (wValue >= 32) + goto error; vhci_hcd->port_status[rhport] &= ~(1 << wValue); break; } From a1383b3537a7bea1c213baa7878ccc4ecf4413b5 Mon Sep 17 00:00:00 2001 From: Wesley Cheng Date: Tue, 29 Dec 2020 15:00:37 -0800 Subject: [PATCH 313/547] usb: dwc3: gadget: Restart DWC3 gadget when enabling pullup usb_gadget_deactivate/usb_gadget_activate does not execute the UDC start operation, which may leave EP0 disabled and event IRQs disabled when re-activating the function. Move the enabling/disabling of USB EP0 and device event IRQs to be performed in the pullup routine. Fixes: ae7e86108b12 ("usb: dwc3: Stop active transfers before halting the controller") Tested-by: Michael Tretter Cc: stable Reported-by: Michael Tretter Signed-off-by: Wesley Cheng Link: https://lore.kernel.org/r/1609282837-21666-1-git-send-email-wcheng@codeaurora.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/gadget.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index 78cb4db8a6e4..25f654b79e48 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -2083,6 +2083,7 @@ static int dwc3_gadget_run_stop(struct dwc3 *dwc, int is_on, int suspend) static void dwc3_gadget_disable_irq(struct dwc3 *dwc); static void __dwc3_gadget_stop(struct dwc3 *dwc); +static int __dwc3_gadget_start(struct dwc3 *dwc); static int dwc3_gadget_pullup(struct usb_gadget *g, int is_on) { @@ -2145,6 +2146,8 @@ static int dwc3_gadget_pullup(struct usb_gadget *g, int is_on) dwc->ev_buf->lpos = (dwc->ev_buf->lpos + count) % dwc->ev_buf->length; } + } else { + __dwc3_gadget_start(dwc); } ret = dwc3_gadget_run_stop(dwc, is_on, false); @@ -2319,10 +2322,6 @@ static int dwc3_gadget_start(struct usb_gadget *g, } dwc->gadget_driver = driver; - - if (pm_runtime_active(dwc->dev)) - __dwc3_gadget_start(dwc); - spin_unlock_irqrestore(&dwc->lock, flags); return 0; @@ -2348,13 +2347,6 @@ static int dwc3_gadget_stop(struct usb_gadget *g) unsigned long flags; spin_lock_irqsave(&dwc->lock, flags); - - if (pm_runtime_suspended(dwc->dev)) - goto out; - - __dwc3_gadget_stop(dwc); - -out: dwc->gadget_driver = NULL; spin_unlock_irqrestore(&dwc->lock, flags); From 64e6bbfff52db4bf6785fab9cffab850b2de6870 Mon Sep 17 00:00:00 2001 From: Eddie Hung Date: Tue, 29 Dec 2020 18:53:35 +0800 Subject: [PATCH 314/547] usb: gadget: configfs: Fix use-after-free issue with udc_name There is a use-after-free issue, if access udc_name in function gadget_dev_desc_UDC_store after another context free udc_name in function unregister_gadget. Context 1: gadget_dev_desc_UDC_store()->unregister_gadget()-> free udc_name->set udc_name to NULL Context 2: gadget_dev_desc_UDC_show()-> access udc_name Call trace: dump_backtrace+0x0/0x340 show_stack+0x14/0x1c dump_stack+0xe4/0x134 print_address_description+0x78/0x478 __kasan_report+0x270/0x2ec kasan_report+0x10/0x18 __asan_report_load1_noabort+0x18/0x20 string+0xf4/0x138 vsnprintf+0x428/0x14d0 sprintf+0xe4/0x12c gadget_dev_desc_UDC_show+0x54/0x64 configfs_read_file+0x210/0x3a0 __vfs_read+0xf0/0x49c vfs_read+0x130/0x2b4 SyS_read+0x114/0x208 el0_svc_naked+0x34/0x38 Add mutex_lock to protect this kind of scenario. Signed-off-by: Eddie Hung Signed-off-by: Macpaul Lin Reviewed-by: Peter Chen Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/1609239215-21819-1-git-send-email-macpaul.lin@mediatek.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/configfs.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/configfs.c b/drivers/usb/gadget/configfs.c index 56051bb97349..d9743f4b56c3 100644 --- a/drivers/usb/gadget/configfs.c +++ b/drivers/usb/gadget/configfs.c @@ -221,9 +221,16 @@ static ssize_t gadget_dev_desc_bcdUSB_store(struct config_item *item, static ssize_t gadget_dev_desc_UDC_show(struct config_item *item, char *page) { - char *udc_name = to_gadget_info(item)->composite.gadget_driver.udc_name; + struct gadget_info *gi = to_gadget_info(item); + char *udc_name; + int ret; - return sprintf(page, "%s\n", udc_name ?: ""); + mutex_lock(&gi->lock); + udc_name = gi->composite.gadget_driver.udc_name; + ret = sprintf(page, "%s\n", udc_name ?: ""); + mutex_unlock(&gi->lock); + + return ret; } static int unregister_gadget(struct gadget_info *gi) From 7043e311a57625467b6fdb032dec8a6dea878208 Mon Sep 17 00:00:00 2001 From: Peter Chen Date: Wed, 30 Dec 2020 13:11:14 +0800 Subject: [PATCH 315/547] usb: gadget: core: change the comment for usb_gadget_connect The pullup does not need to be enabled at below situations: - For platforms which the hardware pullup starts after setting register even they do not see the VBUS. If the pullup is always enabled for these platforms, it will consume more power and break the USB IF compliance tests [1]. - For platforms which need to do BC 1.2 charger detection after seeing the VBUS. Pullup D+ will break the charger detection flow. - For platforms which the system suspend is allowed when the connection with host is there but the bus is not at suspend. For these platforms, it is better to disable pullup when entering the system suspend otherwise the host may confuse the device behavior after controller is in low power mode. Disable pullup is considered as a disconnection event from host side. [1] USB-IF Full and Low Speed Compliance Test Procedure F Back-voltage Testing Section 7.2.1 of the USB specification requires that no device shall supply (source) current on VBUS at its upstream facing port at any time. From VBUS on its upstream facing port, a device may only draw (sink) current. They may not provide power to the pull-up resistor on D+/D- unless VBUS is present (see Section 7.1.5). Signed-off-by: Peter Chen Link: https://lore.kernel.org/r/20201230051114.21370-1-peter.chen@nxp.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c index 5b5cfeb6c14a..6a62bbd01324 100644 --- a/drivers/usb/gadget/udc/core.c +++ b/drivers/usb/gadget/udc/core.c @@ -659,8 +659,7 @@ EXPORT_SYMBOL_GPL(usb_gadget_vbus_disconnect); * * Enables the D+ (or potentially D-) pullup. The host will start * enumerating this gadget when the pullup is active and a VBUS session - * is active (the link is powered). This pullup is always enabled unless - * usb_gadget_disconnect() has been used to disable it. + * is active (the link is powered). * * Returns zero on success, else negative errno. */ From d7889c2020e08caab0d7e36e947f642d91015bd0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sun, 3 Jan 2021 22:42:17 +0100 Subject: [PATCH 316/547] usb: gadget: select CONFIG_CRC32 Without crc32 support, this driver fails to link: arm-linux-gnueabi-ld: drivers/usb/gadget/function/f_eem.o: in function `eem_unwrap': f_eem.c:(.text+0x11cc): undefined reference to `crc32_le' arm-linux-gnueabi-ld: drivers/usb/gadget/function/f_ncm.o:f_ncm.c:(.text+0x1e40): more undefined references to `crc32_le' follow Fixes: 6d3865f9d41f ("usb: gadget: NCM: Add transmit multi-frame.") Signed-off-by: Arnd Bergmann Cc: stable Link: https://lore.kernel.org/r/20210103214224.1996535-1-arnd@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/gadget/Kconfig b/drivers/usb/gadget/Kconfig index 7e47e6223089..2d152571a7de 100644 --- a/drivers/usb/gadget/Kconfig +++ b/drivers/usb/gadget/Kconfig @@ -265,6 +265,7 @@ config USB_CONFIGFS_NCM depends on NET select USB_U_ETHER select USB_F_NCM + select CRC32 help NCM is an advanced protocol for Ethernet encapsulation, allows grouping of several ethernet frames into one USB transfer and @@ -314,6 +315,7 @@ config USB_CONFIGFS_EEM depends on NET select USB_U_ETHER select USB_F_EEM + select CRC32 help CDC EEM is a newer USB standard that is somewhat simpler than CDC ECM and therefore can be supported by more hardware. Technically ECM and From 6cd0fe91387917be48e91385a572a69dfac2f3f7 Mon Sep 17 00:00:00 2001 From: Chandana Kishori Chiluveru Date: Tue, 29 Dec 2020 14:44:43 -0800 Subject: [PATCH 317/547] usb: gadget: configfs: Preserve function ordering after bind failure When binding the ConfigFS gadget to a UDC, the functions in each configuration are added in list order. However, if usb_add_function() fails, the failed function is put back on its configuration's func_list and purge_configs_funcs() is called to further clean up. purge_configs_funcs() iterates over the configurations and functions in forward order, calling unbind() on each of the previously added functions. But after doing so, each function gets moved to the tail of the configuration's func_list. This results in reshuffling the original order of the functions within a configuration such that the failed function now appears first even though it may have originally appeared in the middle or even end of the list. At this point if the ConfigFS gadget is attempted to re-bind to the UDC, the functions will be added in a different order than intended, with the only recourse being to remove and relink the functions all over again. An example of this as follows: ln -s functions/mass_storage.0 configs/c.1 ln -s functions/ncm.0 configs/c.1 ln -s functions/ffs.adb configs/c.1 # oops, forgot to start adbd echo "" > UDC # fails start adbd echo "" > UDC # now succeeds, but... # bind order is # "ADB", mass_storage, ncm [30133.118289] configfs-gadget gadget: adding 'Mass Storage Function'/ffffff810af87200 to config 'c'/ffffff817d6a2520 [30133.119875] configfs-gadget gadget: adding 'cdc_network'/ffffff80f48d1a00 to config 'c'/ffffff817d6a2520 [30133.119974] using random self ethernet address [30133.120002] using random host ethernet address [30133.139604] usb0: HOST MAC 3e:27:46:ba:3e:26 [30133.140015] usb0: MAC 6e:28:7e:42:66:6a [30133.140062] configfs-gadget gadget: adding 'Function FS Gadget'/ffffff80f3868438 to config 'c'/ffffff817d6a2520 [30133.140081] configfs-gadget gadget: adding 'Function FS Gadget'/ffffff80f3868438 --> -19 [30133.140098] configfs-gadget gadget: unbind function 'Mass Storage Function'/ffffff810af87200 [30133.140119] configfs-gadget gadget: unbind function 'cdc_network'/ffffff80f48d1a00 [30133.173201] configfs-gadget a600000.dwc3: failed to start g1: -19 [30136.661933] init: starting service 'adbd'... [30136.700126] read descriptors [30136.700413] read strings [30138.574484] configfs-gadget gadget: adding 'Function FS Gadget'/ffffff80f3868438 to config 'c'/ffffff817d6a2520 [30138.575497] configfs-gadget gadget: adding 'Mass Storage Function'/ffffff810af87200 to config 'c'/ffffff817d6a2520 [30138.575554] configfs-gadget gadget: adding 'cdc_network'/ffffff80f48d1a00 to config 'c'/ffffff817d6a2520 [30138.575631] using random self ethernet address [30138.575660] using random host ethernet address [30138.595338] usb0: HOST MAC 2e:cf:43:cd:ca:c8 [30138.597160] usb0: MAC 6a:f0:9f:ee:82:a0 [30138.791490] configfs-gadget gadget: super-speed config #1: c Fix this by reversing the iteration order of the functions in purge_config_funcs() when unbinding them, and adding them back to the config's func_list at the head instead of the tail. This ensures that we unbind and unwind back to the original list order. Fixes: 88af8bbe4ef7 ("usb: gadget: the start of the configfs interface") Signed-off-by: Chandana Kishori Chiluveru Signed-off-by: Jack Pham Reviewed-by: Peter Chen Link: https://lore.kernel.org/r/20201229224443.31623-1-jackp@codeaurora.org Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/configfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/configfs.c b/drivers/usb/gadget/configfs.c index d9743f4b56c3..408a5332a975 100644 --- a/drivers/usb/gadget/configfs.c +++ b/drivers/usb/gadget/configfs.c @@ -1255,9 +1255,9 @@ static void purge_configs_funcs(struct gadget_info *gi) cfg = container_of(c, struct config_usb_cfg, c); - list_for_each_entry_safe(f, tmp, &c->functions, list) { + list_for_each_entry_safe_reverse(f, tmp, &c->functions, list) { - list_move_tail(&f->list, &cfg->func_list); + list_move(&f->list, &cfg->func_list); if (f->unbind) { dev_dbg(&gi->cdev.gadget->dev, "unbind function '%s'/%p\n", From e1263f9277bad198c2acc8092a41aea1edbea0e4 Mon Sep 17 00:00:00 2001 From: Amelie Delaunay Date: Mon, 4 Jan 2021 15:20:45 +0100 Subject: [PATCH 318/547] dmaengine: stm32-mdma: fix STM32_MDMA_VERY_HIGH_PRIORITY value STM32_MDMA_VERY_HIGH_PRIORITY is b11 not 0x11, so fix it with 0x3. Signed-off-by: Amelie Delaunay Link: https://lore.kernel.org/r/20210104142045.25583-1-amelie.delaunay@foss.st.com Signed-off-by: Vinod Koul --- drivers/dma/stm32-mdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/stm32-mdma.c b/drivers/dma/stm32-mdma.c index e4637ec786d3..36ba8b43e78d 100644 --- a/drivers/dma/stm32-mdma.c +++ b/drivers/dma/stm32-mdma.c @@ -199,7 +199,7 @@ #define STM32_MDMA_MAX_CHANNELS 63 #define STM32_MDMA_MAX_REQUESTS 256 #define STM32_MDMA_MAX_BURST 128 -#define STM32_MDMA_VERY_HIGH_PRIORITY 0x11 +#define STM32_MDMA_VERY_HIGH_PRIORITY 0x3 enum stm32_mdma_trigger_mode { STM32_MDMA_BUFFER, From 65a4e5299739abe0888cda0938d21f8ea3b5c606 Mon Sep 17 00:00:00 2001 From: David Gow Date: Mon, 21 Dec 2020 23:39:00 -0800 Subject: [PATCH 319/547] kunit: tool: Force the use of the 'tty' console for UML kunit_tool relies on the UML console outputting printk() output to the tty in order to get results. Since the default console driver could change, pass 'console=tty' to the kernel. This is triggered by a change[1] to use ttynull as a fallback console driver which -- by chance or by design -- seems to have changed the default console output on UML, breaking kunit_tool. While this may be fixed, we should be less fragile to such changes in the default. [1]: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=757055ae8dedf5333af17b3b5b4b70ba9bc9da4e Signed-off-by: David Gow Fixes: 757055ae8ded ("init/console: Use ttynull as a fallback when there is no console") Reported-by: Andy Shevchenko Tested-by: Andy Shevchenko Acked-by: Brendan Higgins Signed-off-by: Shuah Khan --- tools/testing/kunit/kunit_kernel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/kunit/kunit_kernel.py b/tools/testing/kunit/kunit_kernel.py index 57c1724b7e5d..698358c9c0d6 100644 --- a/tools/testing/kunit/kunit_kernel.py +++ b/tools/testing/kunit/kunit_kernel.py @@ -198,7 +198,7 @@ class LinuxSourceTree(object): return self.validate_config(build_dir) def run_kernel(self, args=[], build_dir='', timeout=None): - args.extend(['mem=1G']) + args.extend(['mem=1G', 'console=tty']) self._ops.linux_bin(args, timeout, build_dir) outfile = get_outfile_path(build_dir) subprocess.call(['stty', 'sane']) From 3b4cf848dad5dad4bf239ba664c809c8cf29f1ed Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Thu, 17 Dec 2020 17:31:40 +0100 Subject: [PATCH 320/547] selftests/vDSO: add additional binaries to .gitignore Add the test binaries introduced by commit 693f5ca08ca0 ("kselftest: Extend vDSO selftest"), commit 03f55c7952c9 ("kselftest: Extend vDSO selftest to clock_getres") and commit c7e5789b24d3 ("kselftest: Move test_vdso to the vDSO test suite") to .gitignore. Signed-off-by: Tobias Klauser Signed-off-by: Shuah Khan --- tools/testing/selftests/vDSO/.gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/vDSO/.gitignore b/tools/testing/selftests/vDSO/.gitignore index 5eb64d41e541..a8dc51af5a9c 100644 --- a/tools/testing/selftests/vDSO/.gitignore +++ b/tools/testing/selftests/vDSO/.gitignore @@ -1,5 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-only vdso_test +vdso_test_abi +vdso_test_clock_getres +vdso_test_correctness vdso_test_gettimeofday vdso_test_getcpu vdso_standalone_test_x86 From df00d02989024d193a6efd1a85513a5658c6a10f Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Thu, 17 Dec 2020 17:32:35 +0100 Subject: [PATCH 321/547] selftests/vDSO: fix -Wformat warning in vdso_test_correctness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the following -Wformat warnings in vdso_test_correctness.c: vdso_test_correctness.c: In function ‘test_one_clock_gettime64’: vdso_test_correctness.c:352:21: warning: format ‘%ld’ expects argument of type ‘long int’, but argument 3 has type ‘long long int’ [-Wformat=] 352 | printf("\t%llu.%09ld %llu.%09ld %llu.%09ld\n", | ~~~~^ | | | long int | %09lld 353 | (unsigned long long)start.tv_sec, start.tv_nsec, | ~~~~~~~~~~~~~ | | | long long int vdso_test_correctness.c:352:32: warning: format ‘%ld’ expects argument of type ‘long int’, but argument 5 has type ‘long long int’ [-Wformat=] 352 | printf("\t%llu.%09ld %llu.%09ld %llu.%09ld\n", | ~~~~^ | | | long int | %09lld 353 | (unsigned long long)start.tv_sec, start.tv_nsec, 354 | (unsigned long long)vdso.tv_sec, vdso.tv_nsec, | ~~~~~~~~~~~~ | | | long long int vdso_test_correctness.c:352:43: warning: format ‘%ld’ expects argument of type ‘long int’, but argument 7 has type ‘long long int’ [-Wformat=] The tv_sec member of __kernel_timespec is long long, both in uapi/linux/time_types.h and locally in vdso_test_correctness.c. Signed-off-by: Tobias Klauser Signed-off-by: Shuah Khan --- tools/testing/selftests/vDSO/vdso_test_correctness.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/vDSO/vdso_test_correctness.c b/tools/testing/selftests/vDSO/vdso_test_correctness.c index 5029ef9b228c..c4aea794725a 100644 --- a/tools/testing/selftests/vDSO/vdso_test_correctness.c +++ b/tools/testing/selftests/vDSO/vdso_test_correctness.c @@ -349,7 +349,7 @@ static void test_one_clock_gettime64(int clock, const char *name) return; } - printf("\t%llu.%09ld %llu.%09ld %llu.%09ld\n", + printf("\t%llu.%09lld %llu.%09lld %llu.%09lld\n", (unsigned long long)start.tv_sec, start.tv_nsec, (unsigned long long)vdso.tv_sec, vdso.tv_nsec, (unsigned long long)end.tv_sec, end.tv_nsec); From 8cbebc4118b5933b3ae6351ceb433f75ac6b7c6b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 4 Jan 2021 16:50:16 +0000 Subject: [PATCH 322/547] KVM: arm64: Replace KVM_ARM_PMU with HW_PERF_EVENTS KVM_ARM_PMU only existed for the benefit of 32bit ARM hosts, and makes no sense now that we are 64bit only. Get rid of it. Signed-off-by: Marc Zyngier --- arch/arm64/kvm/Kconfig | 8 -------- arch/arm64/kvm/Makefile | 2 +- include/kvm/arm_pmu.h | 2 +- 3 files changed, 2 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 043756db8f6e..3964acf5451e 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -49,14 +49,6 @@ if KVM source "virt/kvm/Kconfig" -config KVM_ARM_PMU - bool "Virtual Performance Monitoring Unit (PMU) support" - depends on HW_PERF_EVENTS - default y - help - Adds support for a virtual Performance Monitoring Unit (PMU) in - virtual machines. - endif # KVM endif # VIRTUALIZATION diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 60fd181df624..13b017284bf9 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -24,4 +24,4 @@ kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \ vgic/vgic-mmio-v3.o vgic/vgic-kvm-device.o \ vgic/vgic-its.o vgic/vgic-debug.o -kvm-$(CONFIG_KVM_ARM_PMU) += pmu-emul.o +kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index fc85f50fa0e9..8dcb3e1477bc 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -13,7 +13,7 @@ #define ARMV8_PMU_CYCLE_IDX (ARMV8_PMU_MAX_COUNTERS - 1) #define ARMV8_PMU_MAX_COUNTER_PAIRS ((ARMV8_PMU_MAX_COUNTERS + 1) >> 1) -#ifdef CONFIG_KVM_ARM_PMU +#ifdef CONFIG_HW_PERF_EVENTS struct kvm_pmc { u8 idx; /* index into the pmu->pmc array */ From 0b884fe71f9ee6a5df35e677154256ea2099ebb8 Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Mon, 14 Dec 2020 12:58:50 +0800 Subject: [PATCH 323/547] i2c: sprd: use a specific timeout to avoid system hang up issue If the i2c device SCL bus being pulled up due to some exception before message transfer done, the system cannot receive the completing interrupt signal any more, it would not exit waiting loop until MAX_SCHEDULE_TIMEOUT jiffies eclipse, that would make the system seemed hang up. To avoid that happen, this patch adds a specific timeout for message transfer. Fixes: 8b9ec0719834 ("i2c: Add Spreadtrum I2C controller driver") Signed-off-by: Linhua Xu Signed-off-by: Chunyan Zhang [wsa: changed errno to ETIMEDOUT] Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-sprd.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-sprd.c b/drivers/i2c/busses/i2c-sprd.c index 19cda6742423..2917fecf6c80 100644 --- a/drivers/i2c/busses/i2c-sprd.c +++ b/drivers/i2c/busses/i2c-sprd.c @@ -72,6 +72,8 @@ /* timeout (ms) for pm runtime autosuspend */ #define SPRD_I2C_PM_TIMEOUT 1000 +/* timeout (ms) for transfer message */ +#define I2C_XFER_TIMEOUT 1000 /* SPRD i2c data structure */ struct sprd_i2c { @@ -244,6 +246,7 @@ static int sprd_i2c_handle_msg(struct i2c_adapter *i2c_adap, struct i2c_msg *msg, bool is_last_msg) { struct sprd_i2c *i2c_dev = i2c_adap->algo_data; + unsigned long time_left; i2c_dev->msg = msg; i2c_dev->buf = msg->buf; @@ -273,7 +276,10 @@ static int sprd_i2c_handle_msg(struct i2c_adapter *i2c_adap, sprd_i2c_opt_start(i2c_dev); - wait_for_completion(&i2c_dev->complete); + time_left = wait_for_completion_timeout(&i2c_dev->complete, + msecs_to_jiffies(I2C_XFER_TIMEOUT)); + if (!time_left) + return -ETIMEDOUT; return i2c_dev->err; } From 0b3ea2a06de1f52ea30865e227e109a5fd3b6214 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 21 Dec 2020 14:42:25 +0100 Subject: [PATCH 324/547] i2c: i801: Fix the i2c-mux gpiod_lookup_table not being properly terminated gpiod_add_lookup_table() expects the gpiod_lookup_table->table passed to it to be terminated with a zero-ed out entry. So we need to allocate one more entry then we will use. Fixes: d308dfbf62ef ("i2c: mux/i801: Switch to use descriptor passing") Signed-off-by: Hans de Goede Reviewed-by: Mika Westerberg Acked-by: Jean Delvare Reviewed-by: Linus Walleij Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-i801.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c index ae90713443fa..877fe3733a42 100644 --- a/drivers/i2c/busses/i2c-i801.c +++ b/drivers/i2c/busses/i2c-i801.c @@ -1449,7 +1449,7 @@ static int i801_add_mux(struct i801_priv *priv) /* Register GPIO descriptor lookup table */ lookup = devm_kzalloc(dev, - struct_size(lookup, table, mux_config->n_gpios), + struct_size(lookup, table, mux_config->n_gpios + 1), GFP_KERNEL); if (!lookup) return -ENOMEM; From d9e44981739a96f1a468c13bbbd54ace378caf1c Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Mon, 28 Dec 2020 15:21:36 +0000 Subject: [PATCH 325/547] bareudp: set NETIF_F_LLTX flag Like other tunneling interfaces, the bareudp doesn't need TXLOCK. So, It is good to set the NETIF_F_LLTX flag to improve performance and to avoid lockdep's false-positive warning. Test commands: ip netns add A ip netns add B ip link add veth0 netns A type veth peer name veth1 netns B ip netns exec A ip link set veth0 up ip netns exec A ip a a 10.0.0.1/24 dev veth0 ip netns exec B ip link set veth1 up ip netns exec B ip a a 10.0.0.2/24 dev veth1 for i in {2..1} do let A=$i-1 ip netns exec A ip link add bareudp$i type bareudp \ dstport $i ethertype ip ip netns exec A ip link set bareudp$i up ip netns exec A ip a a 10.0.$i.1/24 dev bareudp$i ip netns exec A ip r a 10.0.$i.2 encap ip src 10.0.$A.1 \ dst 10.0.$A.2 via 10.0.$i.2 dev bareudp$i ip netns exec B ip link add bareudp$i type bareudp \ dstport $i ethertype ip ip netns exec B ip link set bareudp$i up ip netns exec B ip a a 10.0.$i.2/24 dev bareudp$i ip netns exec B ip r a 10.0.$i.1 encap ip src 10.0.$A.2 \ dst 10.0.$A.1 via 10.0.$i.1 dev bareudp$i done ip netns exec A ping 10.0.2.2 Splat looks like: [ 96.992803][ T822] ============================================ [ 96.993954][ T822] WARNING: possible recursive locking detected [ 96.995102][ T822] 5.10.0+ #819 Not tainted [ 96.995927][ T822] -------------------------------------------- [ 96.997091][ T822] ping/822 is trying to acquire lock: [ 96.998083][ T822] ffff88810f753898 (_xmit_NONE#2){+.-.}-{2:2}, at: __dev_queue_xmit+0x1f52/0x2960 [ 96.999813][ T822] [ 96.999813][ T822] but task is already holding lock: [ 97.001192][ T822] ffff88810c385498 (_xmit_NONE#2){+.-.}-{2:2}, at: __dev_queue_xmit+0x1f52/0x2960 [ 97.002908][ T822] [ 97.002908][ T822] other info that might help us debug this: [ 97.004401][ T822] Possible unsafe locking scenario: [ 97.004401][ T822] [ 97.005784][ T822] CPU0 [ 97.006407][ T822] ---- [ 97.007010][ T822] lock(_xmit_NONE#2); [ 97.007779][ T822] lock(_xmit_NONE#2); [ 97.008550][ T822] [ 97.008550][ T822] *** DEADLOCK *** [ 97.008550][ T822] [ 97.010057][ T822] May be due to missing lock nesting notation [ 97.010057][ T822] [ 97.011594][ T822] 7 locks held by ping/822: [ 97.012426][ T822] #0: ffff888109a144f0 (sk_lock-AF_INET){+.+.}-{0:0}, at: raw_sendmsg+0x12f7/0x2b00 [ 97.014191][ T822] #1: ffffffffbce2f5a0 (rcu_read_lock_bh){....}-{1:2}, at: ip_finish_output2+0x249/0x2020 [ 97.016045][ T822] #2: ffffffffbce2f5a0 (rcu_read_lock_bh){....}-{1:2}, at: __dev_queue_xmit+0x1fd/0x2960 [ 97.017897][ T822] #3: ffff88810c385498 (_xmit_NONE#2){+.-.}-{2:2}, at: __dev_queue_xmit+0x1f52/0x2960 [ 97.019684][ T822] #4: ffffffffbce2f600 (rcu_read_lock){....}-{1:2}, at: bareudp_xmit+0x31b/0x3690 [bareudp] [ 97.021573][ T822] #5: ffffffffbce2f5a0 (rcu_read_lock_bh){....}-{1:2}, at: ip_finish_output2+0x249/0x2020 [ 97.023424][ T822] #6: ffffffffbce2f5a0 (rcu_read_lock_bh){....}-{1:2}, at: __dev_queue_xmit+0x1fd/0x2960 [ 97.025259][ T822] [ 97.025259][ T822] stack backtrace: [ 97.026349][ T822] CPU: 3 PID: 822 Comm: ping Not tainted 5.10.0+ #819 [ 97.027609][ T822] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 [ 97.029407][ T822] Call Trace: [ 97.030015][ T822] dump_stack+0x99/0xcb [ 97.030783][ T822] __lock_acquire.cold.77+0x149/0x3a9 [ 97.031773][ T822] ? stack_trace_save+0x81/0xa0 [ 97.032661][ T822] ? register_lock_class+0x1910/0x1910 [ 97.033673][ T822] ? register_lock_class+0x1910/0x1910 [ 97.034679][ T822] ? rcu_read_lock_sched_held+0x91/0xc0 [ 97.035697][ T822] ? rcu_read_lock_bh_held+0xa0/0xa0 [ 97.036690][ T822] lock_acquire+0x1b2/0x730 [ 97.037515][ T822] ? __dev_queue_xmit+0x1f52/0x2960 [ 97.038466][ T822] ? check_flags+0x50/0x50 [ 97.039277][ T822] ? netif_skb_features+0x296/0x9c0 [ 97.040226][ T822] ? validate_xmit_skb+0x29/0xb10 [ 97.041151][ T822] _raw_spin_lock+0x30/0x70 [ 97.041977][ T822] ? __dev_queue_xmit+0x1f52/0x2960 [ 97.042927][ T822] __dev_queue_xmit+0x1f52/0x2960 [ 97.043852][ T822] ? netdev_core_pick_tx+0x290/0x290 [ 97.044824][ T822] ? mark_held_locks+0xb7/0x120 [ 97.045712][ T822] ? lockdep_hardirqs_on_prepare+0x12c/0x3e0 [ 97.046824][ T822] ? __local_bh_enable_ip+0xa5/0xf0 [ 97.047771][ T822] ? ___neigh_create+0x12a8/0x1eb0 [ 97.048710][ T822] ? trace_hardirqs_on+0x41/0x120 [ 97.049626][ T822] ? ___neigh_create+0x12a8/0x1eb0 [ 97.050556][ T822] ? __local_bh_enable_ip+0xa5/0xf0 [ 97.051509][ T822] ? ___neigh_create+0x12a8/0x1eb0 [ 97.052443][ T822] ? check_chain_key+0x244/0x5f0 [ 97.053352][ T822] ? rcu_read_lock_bh_held+0x56/0xa0 [ 97.054317][ T822] ? ip_finish_output2+0x6ea/0x2020 [ 97.055263][ T822] ? pneigh_lookup+0x410/0x410 [ 97.056135][ T822] ip_finish_output2+0x6ea/0x2020 [ ... ] Acked-by: Guillaume Nault Fixes: 571912c69f0e ("net: UDP tunnel encapsulation module for tunnelling different protocols like MPLS, IP, NSH etc.") Signed-off-by: Taehee Yoo Link: https://lore.kernel.org/r/20201228152136.24215-1-ap420073@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/bareudp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c index 85ebd2b7e446..aea10196c222 100644 --- a/drivers/net/bareudp.c +++ b/drivers/net/bareudp.c @@ -534,6 +534,7 @@ static void bareudp_setup(struct net_device *dev) SET_NETDEV_DEVTYPE(dev, &bareudp_type); dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM; dev->features |= NETIF_F_RXCSUM; + dev->features |= NETIF_F_LLTX; dev->features |= NETIF_F_GSO_SOFTWARE; dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM; dev->hw_features |= NETIF_F_GSO_SOFTWARE; From 10ad3e998fa0c25315f27cf3002ff8b02dc31c38 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Mon, 28 Dec 2020 15:21:46 +0000 Subject: [PATCH 326/547] bareudp: Fix use of incorrect min_headroom size In the bareudp6_xmit_skb(), it calculates min_headroom. At that point, it uses struct iphdr, but it's not correct. So panic could occur. The struct ipv6hdr should be used. Test commands: ip netns add A ip netns add B ip link add veth0 netns A type veth peer name veth1 netns B ip netns exec A ip link set veth0 up ip netns exec A ip a a 2001:db8:0::1/64 dev veth0 ip netns exec B ip link set veth1 up ip netns exec B ip a a 2001:db8:0::2/64 dev veth1 for i in {10..1} do let A=$i-1 ip netns exec A ip link add bareudp$i type bareudp dstport $i \ ethertype 0x86dd ip netns exec A ip link set bareudp$i up ip netns exec A ip -6 a a 2001:db8:$i::1/64 dev bareudp$i ip netns exec A ip -6 r a 2001:db8:$i::2 encap ip6 src \ 2001:db8:$A::1 dst 2001:db8:$A::2 via 2001:db8:$i::2 \ dev bareudp$i ip netns exec B ip link add bareudp$i type bareudp dstport $i \ ethertype 0x86dd ip netns exec B ip link set bareudp$i up ip netns exec B ip -6 a a 2001:db8:$i::2/64 dev bareudp$i ip netns exec B ip -6 r a 2001:db8:$i::1 encap ip6 src \ 2001:db8:$A::2 dst 2001:db8:$A::1 via 2001:db8:$i::1 \ dev bareudp$i done ip netns exec A ping 2001:db8:7::2 Splat looks like: [ 66.436679][ C2] skbuff: skb_under_panic: text:ffffffff928614c8 len:454 put:14 head:ffff88810abb4000 data:ffff88810abb3ffa tail:0x1c0 end:0x3ec0 dev:veth0 [ 66.441626][ C2] ------------[ cut here ]------------ [ 66.443458][ C2] kernel BUG at net/core/skbuff.c:109! [ 66.445313][ C2] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN PTI [ 66.447606][ C2] CPU: 2 PID: 913 Comm: ping Not tainted 5.10.0+ #819 [ 66.450251][ C2] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 [ 66.453713][ C2] RIP: 0010:skb_panic+0x15d/0x15f [ 66.455345][ C2] Code: 98 fe 4c 8b 4c 24 10 53 8b 4d 70 45 89 e0 48 c7 c7 60 8b 78 93 41 57 41 56 41 55 48 8b 54 24 20 48 8b 74 24 28 e8 b5 40 f9 ff <0f> 0b 48 8b 6c 24 20 89 34 24 e8 08 c9 98 fe 8b 34 24 48 c7 c1 80 [ 66.462314][ C2] RSP: 0018:ffff888119209648 EFLAGS: 00010286 [ 66.464281][ C2] RAX: 0000000000000089 RBX: ffff888003159000 RCX: 0000000000000000 [ 66.467216][ C2] RDX: 0000000000000089 RSI: 0000000000000008 RDI: ffffed10232412c0 [ 66.469768][ C2] RBP: ffff88810a53d440 R08: ffffed102328018d R09: ffffed102328018d [ 66.472297][ C2] R10: ffff888119400c67 R11: ffffed102328018c R12: 000000000000000e [ 66.474833][ C2] R13: ffff88810abb3ffa R14: 00000000000001c0 R15: 0000000000003ec0 [ 66.477361][ C2] FS: 00007f37c0c72f00(0000) GS:ffff888119200000(0000) knlGS:0000000000000000 [ 66.480214][ C2] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 66.482296][ C2] CR2: 000055a058808570 CR3: 000000011039e002 CR4: 00000000003706e0 [ 66.484811][ C2] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 66.487793][ C2] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 66.490424][ C2] Call Trace: [ 66.491469][ C2] [ 66.492374][ C2] ? eth_header+0x28/0x190 [ 66.494054][ C2] ? eth_header+0x28/0x190 [ 66.495401][ C2] skb_push.cold.99+0x22/0x22 [ 66.496700][ C2] eth_header+0x28/0x190 [ 66.497867][ C2] neigh_resolve_output+0x3de/0x720 [ 66.499615][ C2] ? __neigh_update+0x7e8/0x20a0 [ 66.501176][ C2] __neigh_update+0x8bd/0x20a0 [ 66.502749][ C2] ndisc_update+0x34/0xc0 [ 66.504010][ C2] ndisc_recv_na+0x8da/0xb80 [ 66.505041][ C2] ? pndisc_redo+0x20/0x20 [ 66.505888][ C2] ? rcu_read_lock_sched_held+0xc0/0xc0 [ 66.506965][ C2] ndisc_rcv+0x3a0/0x470 [ 66.507797][ C2] icmpv6_rcv+0xad9/0x1b00 [ 66.508645][ C2] ip6_protocol_deliver_rcu+0xcd6/0x1560 [ 66.509719][ C2] ip6_input_finish+0x5b/0xf0 [ 66.510615][ C2] ip6_input+0xcd/0x2d0 [ 66.511406][ C2] ? ip6_input_finish+0xf0/0xf0 [ 66.512327][ C2] ? rcu_read_lock_held+0x91/0xa0 [ 66.513279][ C2] ? ip6_protocol_deliver_rcu+0x1560/0x1560 [ 66.514414][ C2] ipv6_rcv+0xe8/0x300 [ ... ] Acked-by: Guillaume Nault Fixes: 571912c69f0e ("net: UDP tunnel encapsulation module for tunnelling different protocols like MPLS, IP, NSH etc.") Signed-off-by: Taehee Yoo Link: https://lore.kernel.org/r/20201228152146.24270-1-ap420073@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/bareudp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c index aea10196c222..708171c0d628 100644 --- a/drivers/net/bareudp.c +++ b/drivers/net/bareudp.c @@ -380,7 +380,7 @@ static int bareudp6_xmit_skb(struct sk_buff *skb, struct net_device *dev, goto free_dst; min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len + - BAREUDP_BASE_HLEN + info->options_len + sizeof(struct iphdr); + BAREUDP_BASE_HLEN + info->options_len + sizeof(struct ipv6hdr); err = skb_cow_head(skb, min_headroom); if (unlikely(err)) From 01e31bea7e622f1890c274f4aaaaf8bccd296aa5 Mon Sep 17 00:00:00 2001 From: Yunjian Wang Date: Tue, 29 Dec 2020 10:01:48 +0800 Subject: [PATCH 327/547] vhost_net: fix ubuf refcount incorrectly when sendmsg fails Currently the vhost_zerocopy_callback() maybe be called to decrease the refcount when sendmsg fails in tun. The error handling in vhost handle_tx_zerocopy() will try to decrease the same refcount again. This is wrong. To fix this issue, we only call vhost_net_ubuf_put() when vq->heads[nvq->desc].len == VHOST_DMA_IN_PROGRESS. Fixes: bab632d69ee4 ("vhost: vhost TX zero-copy support") Signed-off-by: Yunjian Wang Acked-by: Willem de Bruijn Acked-by: Michael S. Tsirkin Acked-by: Jason Wang Link: https://lore.kernel.org/r/1609207308-20544-1-git-send-email-wangyunjian@huawei.com Signed-off-by: Jakub Kicinski --- drivers/vhost/net.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 531a00d703cd..c8784dfafdd7 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -863,6 +863,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock) size_t len, total_len = 0; int err; struct vhost_net_ubuf_ref *ubufs; + struct ubuf_info *ubuf; bool zcopy_used; int sent_pkts = 0; @@ -895,9 +896,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock) /* use msg_control to pass vhost zerocopy ubuf info to skb */ if (zcopy_used) { - struct ubuf_info *ubuf; ubuf = nvq->ubuf_info + nvq->upend_idx; - vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head); vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS; ubuf->callback = vhost_zerocopy_callback; @@ -927,7 +926,8 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock) err = sock->ops->sendmsg(sock, &msg, len); if (unlikely(err < 0)) { if (zcopy_used) { - vhost_net_ubuf_put(ubufs); + if (vq->heads[ubuf->desc].len == VHOST_DMA_IN_PROGRESS) + vhost_net_ubuf_put(ubufs); nvq->upend_idx = ((unsigned)nvq->upend_idx - 1) % UIO_MAXIOV; } From 17e94567c57df3d9609e6bacaed9247c4f2629e2 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Tue, 29 Dec 2020 11:08:38 +0200 Subject: [PATCH 328/547] docs: networking: packet_mmap: fix formatting for C macros The citation of macro definitions should appear in a code block. Signed-off-by: Baruch Siach Link: https://lore.kernel.org/r/5cb47005e7a59b64299e038827e295822193384c.1609232919.git.baruch@tkos.co.il Signed-off-by: Jakub Kicinski --- Documentation/networking/packet_mmap.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/networking/packet_mmap.rst b/Documentation/networking/packet_mmap.rst index 6c009ceb1183..f3646c80b019 100644 --- a/Documentation/networking/packet_mmap.rst +++ b/Documentation/networking/packet_mmap.rst @@ -437,7 +437,7 @@ and the following flags apply: Capture process ^^^^^^^^^^^^^^^ - from include/linux/if_packet.h +From include/linux/if_packet.h:: #define TP_STATUS_COPY (1 << 1) #define TP_STATUS_LOSING (1 << 2) From e4da63cda51f17fa1e86a10e84d47d692932530d Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Tue, 29 Dec 2020 11:08:39 +0200 Subject: [PATCH 329/547] docs: networking: packet_mmap: fix old config reference Before commit 889b8f964f2f ("packet: Kill CONFIG_PACKET_MMAP.") there used to be a CONFIG_PACKET_MMAP config symbol that depended on CONFIG_PACKET. The text still implies that PACKET_MMAP can be disabled. Remove that from the text, as well as reference to old kernel versions. Also, drop reference to broken link to information for pre 2.6.5 kernels. Make a slight working improvement (s/In/On/) while at it. Signed-off-by: Baruch Siach Link: https://lore.kernel.org/r/80089f3783372c8fd7833f28ce774a171b2ef252.1609232919.git.baruch@tkos.co.il Signed-off-by: Jakub Kicinski --- Documentation/networking/packet_mmap.rst | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/Documentation/networking/packet_mmap.rst b/Documentation/networking/packet_mmap.rst index f3646c80b019..500ef60b1b82 100644 --- a/Documentation/networking/packet_mmap.rst +++ b/Documentation/networking/packet_mmap.rst @@ -8,7 +8,7 @@ Abstract ======== This file documents the mmap() facility available with the PACKET -socket interface on 2.4/2.6/3.x kernels. This type of sockets is used for +socket interface. This type of sockets is used for i) capture network traffic with utilities like tcpdump, ii) transmit network traffic, or any other that needs raw @@ -25,12 +25,12 @@ Please send your comments to Why use PACKET_MMAP =================== -In Linux 2.4/2.6/3.x if PACKET_MMAP is not enabled, the capture process is very +Non PACKET_MMAP capture process (plain AF_PACKET) is very inefficient. It uses very limited buffers and requires one system call to capture each packet, it requires two if you want to get packet's timestamp (like libpcap always does). -In the other hand PACKET_MMAP is very efficient. PACKET_MMAP provides a size +On the other hand PACKET_MMAP is very efficient. PACKET_MMAP provides a size configurable circular buffer mapped in user space that can be used to either send or receive packets. This way reading packets just needs to wait for them, most of the time there is no need to issue a single system call. Concerning @@ -252,8 +252,7 @@ PACKET_MMAP setting constraints In kernel versions prior to 2.4.26 (for the 2.4 branch) and 2.6.5 (2.6 branch), the PACKET_MMAP buffer could hold only 32768 frames in a 32 bit architecture or -16384 in a 64 bit architecture. For information on these kernel versions -see http://pusa.uv.es/~ulisses/packet_mmap/packet_mmap.pre-2.4.26_2.6.5.txt +16384 in a 64 bit architecture. Block size limit ---------------- From 862aecbd9569e563b979c0e23a908b43cda4b0b9 Mon Sep 17 00:00:00 2001 From: YANG LI Date: Wed, 30 Dec 2020 15:23:14 +0800 Subject: [PATCH 330/547] ibmvnic: fix: NULL pointer dereference. The error is due to dereference a null pointer in function reset_one_sub_crq_queue(): if (!scrq) { netdev_dbg(adapter->netdev, "Invalid scrq reset. irq (%d) or msgs(%p).\n", scrq->irq, scrq->msgs); return -EINVAL; } If the expression is true, scrq must be a null pointer and cannot dereference. Fixes: 9281cf2d5840 ("ibmvnic: avoid memset null scrq msgs") Signed-off-by: YANG LI Reported-by: Abaci Acked-by: Lijun Pan Link: https://lore.kernel.org/r/1609312994-121032-1-git-send-email-abaci-bugfix@linux.alibaba.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ibm/ibmvnic.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 91ebcde30292..9778c83150f1 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -2981,9 +2981,7 @@ static int reset_one_sub_crq_queue(struct ibmvnic_adapter *adapter, int rc; if (!scrq) { - netdev_dbg(adapter->netdev, - "Invalid scrq reset. irq (%d) or msgs (%p).\n", - scrq->irq, scrq->msgs); + netdev_dbg(adapter->netdev, "Invalid scrq reset.\n"); return -EINVAL; } From 1d0d561ad1d7606bb745c1ed9478e7206860e56e Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Mon, 4 Jan 2021 10:38:02 +0000 Subject: [PATCH 331/547] net: macb: Correct usage of MACB_CAPS_CLK_HW_CHG flag A new flag MACB_CAPS_CLK_HW_CHG was added and all callers of macb_set_tx_clk were gated on the presence of this flag. - if (!clk) + if (!bp->tx_clk || !(bp->caps & MACB_CAPS_CLK_HW_CHG)) However the flag was not added to anything other than the new sama7g5_gem, turning that function call into a no op for all other systems. This breaks the networking on Zynq. The commit message adding this states: a new capability so that macb_set_tx_clock() to not be called for IPs having this capability This strongly implies that present of the flag was intended to skip the function not absence of the flag. Update the if statement to this effect, which repairs the existing users. Fixes: daafa1d33cc9 ("net: macb: add capability to not set the clock rate") Suggested-by: Andrew Lunn Signed-off-by: Charles Keepax Reviewed-by: Claudiu Beznea Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20210104103802.13091-1-ckeepax@opensource.cirrus.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cadence/macb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index d5d910916c2e..814a5b10141d 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -467,7 +467,7 @@ static void macb_set_tx_clk(struct macb *bp, int speed) { long ferr, rate, rate_rounded; - if (!bp->tx_clk || !(bp->caps & MACB_CAPS_CLK_HW_CHG)) + if (!bp->tx_clk || (bp->caps & MACB_CAPS_CLK_HW_CHG)) return; switch (speed) { From 2ff2c7e274392871bfdee00ff2adbb8ebae5d240 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 30 Dec 2020 13:42:51 +0200 Subject: [PATCH 332/547] selftests: mlxsw: Set headroom size of correct port The test was setting the headroom size of the wrong port. This was not visible because of a firmware bug that canceled this bug. Set the headroom size of the correct port, so that the test will pass with both old and new firmware versions. Fixes: bfa804784e32 ("selftests: mlxsw: Add a PFC test") Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Link: https://lore.kernel.org/r/20201230114251.394009-1-idosch@idosch.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/mlxsw/qos_pfc.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_pfc.sh b/tools/testing/selftests/drivers/net/mlxsw/qos_pfc.sh index 4d900bc1f76c..5c7700212f75 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/qos_pfc.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/qos_pfc.sh @@ -230,7 +230,7 @@ switch_create() __mlnx_qos -i $swp4 --pfc=0,1,0,0,0,0,0,0 >/dev/null # PG0 will get autoconfigured to Xoff, give PG1 arbitrarily 100K, which # is (-2*MTU) about 80K of delay provision. - __mlnx_qos -i $swp3 --buffer_size=0,$_100KB,0,0,0,0,0,0 >/dev/null + __mlnx_qos -i $swp4 --buffer_size=0,$_100KB,0,0,0,0,0,0 >/dev/null # bridges # ------- From cfd82dfc9799c53ef109343a23af006a0f6860a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Wed, 30 Dec 2020 16:24:51 +0100 Subject: [PATCH 333/547] net: usb: qmi_wwan: add Quectel EM160R-GL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New modem using ff/ff/30 for QCDM, ff/00/00 for AT and NMEA, and ff/ff/ff for RMNET/QMI. T: Bus=02 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 2 Spd=5000 MxCh= 0 D: Ver= 3.20 Cls=ef(misc ) Sub=02 Prot=01 MxPS= 9 #Cfgs= 1 P: Vendor=2c7c ProdID=0620 Rev= 4.09 S: Manufacturer=Quectel S: Product=EM160R-GL S: SerialNumber=e31cedc1 C:* #Ifs= 5 Cfg#= 1 Atr=a0 MxPwr=896mA I:* If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=(none) E: Ad=81(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=01(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=(none) E: Ad=83(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=82(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=(none) E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=84(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms I:* If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=(none) E: Ad=87(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=86(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=(none) E: Ad=88(I) Atr=03(Int.) MxPS= 8 Ivl=32ms E: Ad=8e(I) Atr=02(Bulk) MxPS=1024 Ivl=0ms E: Ad=0f(O) Atr=02(Bulk) MxPS=1024 Ivl=0ms Signed-off-by: Bjørn Mork Link: https://lore.kernel.org/r/20201230152451.245271-1-bjorn@mork.no Signed-off-by: Jakub Kicinski --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index d166c321ee9b..af19513a9f75 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1013,6 +1013,7 @@ static const struct usb_device_id products[] = { {QMI_MATCH_FF_FF_FF(0x2c7c, 0x0125)}, /* Quectel EC25, EC20 R2.0 Mini PCIe */ {QMI_MATCH_FF_FF_FF(0x2c7c, 0x0306)}, /* Quectel EP06/EG06/EM06 */ {QMI_MATCH_FF_FF_FF(0x2c7c, 0x0512)}, /* Quectel EG12/EM12 */ + {QMI_MATCH_FF_FF_FF(0x2c7c, 0x0620)}, /* Quectel EM160R-GL */ {QMI_MATCH_FF_FF_FF(0x2c7c, 0x0800)}, /* Quectel RM500Q-GL */ /* 3. Combined interface devices matching on interface number */ From e80bd76fbf563cc7ed8c9e9f3bbcdf59b0897f69 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 30 Dec 2020 19:33:34 +0100 Subject: [PATCH 334/547] r8169: work around power-saving bug on some chip versions A user reported failing network with RTL8168dp (a quite rare chip version). Realtek confirmed that few chip versions suffer from a PLL power-down hw bug. Fixes: 07df5bd874f0 ("r8169: power down chip in probe") Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/a1c39460-d533-7f9e-fa9d-2b8990b02426@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 46d8510b2fe2..a569abe7f5ef 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -2207,7 +2207,8 @@ static void rtl_pll_power_down(struct rtl8169_private *tp) } switch (tp->mac_version) { - case RTL_GIGA_MAC_VER_25 ... RTL_GIGA_MAC_VER_33: + case RTL_GIGA_MAC_VER_25 ... RTL_GIGA_MAC_VER_26: + case RTL_GIGA_MAC_VER_32 ... RTL_GIGA_MAC_VER_33: case RTL_GIGA_MAC_VER_37: case RTL_GIGA_MAC_VER_39: case RTL_GIGA_MAC_VER_43: @@ -2233,7 +2234,8 @@ static void rtl_pll_power_down(struct rtl8169_private *tp) static void rtl_pll_power_up(struct rtl8169_private *tp) { switch (tp->mac_version) { - case RTL_GIGA_MAC_VER_25 ... RTL_GIGA_MAC_VER_33: + case RTL_GIGA_MAC_VER_25 ... RTL_GIGA_MAC_VER_26: + case RTL_GIGA_MAC_VER_32 ... RTL_GIGA_MAC_VER_33: case RTL_GIGA_MAC_VER_37: case RTL_GIGA_MAC_VER_39: case RTL_GIGA_MAC_VER_43: From b40f97b91a3b167ab22c9e9f1ef00b1615ff01e9 Mon Sep 17 00:00:00 2001 From: Xie He Date: Thu, 31 Dec 2020 09:43:31 -0800 Subject: [PATCH 335/547] net: lapb: Decrease the refcount of "struct lapb_cb" in lapb_device_event In lapb_device_event, lapb_devtostruct is called to get a reference to an object of "struct lapb_cb". lapb_devtostruct increases the refcount of the object and returns a pointer to it. However, we didn't decrease the refcount after we finished using the pointer. This patch fixes this problem. Fixes: a4989fa91110 ("net/lapb: support netdev events") Cc: Martin Schiller Signed-off-by: Xie He Link: https://lore.kernel.org/r/20201231174331.64539-1-xie.he.0141@gmail.com Signed-off-by: Jakub Kicinski --- net/lapb/lapb_iface.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/lapb/lapb_iface.c b/net/lapb/lapb_iface.c index 213ea7abc9ab..40961889e9c0 100644 --- a/net/lapb/lapb_iface.c +++ b/net/lapb/lapb_iface.c @@ -489,6 +489,7 @@ static int lapb_device_event(struct notifier_block *this, unsigned long event, break; } + lapb_put(lapb); return NOTIFY_DONE; } From c1a9ec7e5d577a9391660800c806c53287fca991 Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Sun, 3 Jan 2021 02:25:43 +0100 Subject: [PATCH 336/547] net: dsa: lantiq_gswip: Enable GSWIP_MII_CFG_EN also for internal PHYs Enable GSWIP_MII_CFG_EN also for internal PHYs to make traffic flow. Without this the PHY link is detected properly and ethtool statistics for TX are increasing but there's no RX traffic coming in. Fixes: 14fceff4771e51 ("net: dsa: Add Lantiq / Intel DSA driver for vrx200") Suggested-by: Hauke Mehrtens Signed-off-by: Martin Blumenstingl Acked-by: Hauke Mehrtens Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- drivers/net/dsa/lantiq_gswip.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/dsa/lantiq_gswip.c b/drivers/net/dsa/lantiq_gswip.c index 09701c17f3f6..5d378c8026f0 100644 --- a/drivers/net/dsa/lantiq_gswip.c +++ b/drivers/net/dsa/lantiq_gswip.c @@ -1541,9 +1541,7 @@ static void gswip_phylink_mac_link_up(struct dsa_switch *ds, int port, { struct gswip_priv *priv = ds->priv; - /* Enable the xMII interface only for the external PHY */ - if (interface != PHY_INTERFACE_MODE_INTERNAL) - gswip_mii_mask_cfg(priv, 0, GSWIP_MII_CFG_EN, port); + gswip_mii_mask_cfg(priv, 0, GSWIP_MII_CFG_EN, port); } static void gswip_get_strings(struct dsa_switch *ds, int port, u32 stringset, From 709a3c9dff2a639966ae7d8ba6239d2b8aba036d Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Sun, 3 Jan 2021 02:25:44 +0100 Subject: [PATCH 337/547] net: dsa: lantiq_gswip: Fix GSWIP_MII_CFG(p) register access There is one GSWIP_MII_CFG register for each switch-port except the CPU port. The register offset for the first port is 0x0, 0x02 for the second, 0x04 for the third and so on. Update the driver to not only restrict the GSWIP_MII_CFG registers to ports 0, 1 and 5. Handle ports 0..5 instead but skip the CPU port. This means we are not overwriting the configuration for the third port (port two since we start counting from zero) with the settings for the sixth port (with number five) anymore. The GSWIP_MII_PCDU(p) registers are not updated because there's really only three (one for each of the following ports: 0, 1, 5). Fixes: 14fceff4771e51 ("net: dsa: Add Lantiq / Intel DSA driver for vrx200") Signed-off-by: Martin Blumenstingl Acked-by: Hauke Mehrtens Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- drivers/net/dsa/lantiq_gswip.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/drivers/net/dsa/lantiq_gswip.c b/drivers/net/dsa/lantiq_gswip.c index 5d378c8026f0..4b36d89bec06 100644 --- a/drivers/net/dsa/lantiq_gswip.c +++ b/drivers/net/dsa/lantiq_gswip.c @@ -92,9 +92,7 @@ GSWIP_MDIO_PHY_FDUP_MASK) /* GSWIP MII Registers */ -#define GSWIP_MII_CFG0 0x00 -#define GSWIP_MII_CFG1 0x02 -#define GSWIP_MII_CFG5 0x04 +#define GSWIP_MII_CFGp(p) (0x2 * (p)) #define GSWIP_MII_CFG_EN BIT(14) #define GSWIP_MII_CFG_LDCLKDIS BIT(12) #define GSWIP_MII_CFG_MODE_MIIP 0x0 @@ -392,17 +390,9 @@ static void gswip_mii_mask(struct gswip_priv *priv, u32 clear, u32 set, static void gswip_mii_mask_cfg(struct gswip_priv *priv, u32 clear, u32 set, int port) { - switch (port) { - case 0: - gswip_mii_mask(priv, clear, set, GSWIP_MII_CFG0); - break; - case 1: - gswip_mii_mask(priv, clear, set, GSWIP_MII_CFG1); - break; - case 5: - gswip_mii_mask(priv, clear, set, GSWIP_MII_CFG5); - break; - } + /* There's no MII_CFG register for the CPU port */ + if (!dsa_is_cpu_port(priv->ds, port)) + gswip_mii_mask(priv, clear, set, GSWIP_MII_CFGp(port)); } static void gswip_mii_mask_pcdu(struct gswip_priv *priv, u32 clear, u32 set, @@ -822,9 +812,8 @@ static int gswip_setup(struct dsa_switch *ds) gswip_mdio_mask(priv, 0xff, 0x09, GSWIP_MDIO_MDC_CFG1); /* Disable the xMII link */ - gswip_mii_mask_cfg(priv, GSWIP_MII_CFG_EN, 0, 0); - gswip_mii_mask_cfg(priv, GSWIP_MII_CFG_EN, 0, 1); - gswip_mii_mask_cfg(priv, GSWIP_MII_CFG_EN, 0, 5); + for (i = 0; i < priv->hw_info->max_ports; i++) + gswip_mii_mask_cfg(priv, GSWIP_MII_CFG_EN, 0, i); /* enable special tag insertion on cpu port */ gswip_switch_mask(priv, 0, GSWIP_FDMA_PCTRL_STEN, From 81b6d05ccad4f3d8a9dfb091fb46ad6978ee40e4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 4 Jan 2021 20:36:35 +0000 Subject: [PATCH 338/547] io_uring: synchronise IOPOLL on task_submit fail io_req_task_submit() might be called for IOPOLL, do the fail path under uring_lock to comply with IOPOLL synchronisation based solely on it. Cc: stable@vger.kernel.org # 5.5+ Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index ca46f314640b..5be33fd8b6bc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2126,15 +2126,16 @@ static void io_req_task_cancel(struct callback_head *cb) static void __io_req_task_submit(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; + bool fail; - if (!__io_sq_thread_acquire_mm(ctx) && - !__io_sq_thread_acquire_files(ctx)) { - mutex_lock(&ctx->uring_lock); + fail = __io_sq_thread_acquire_mm(ctx) || + __io_sq_thread_acquire_files(ctx); + mutex_lock(&ctx->uring_lock); + if (!fail) __io_queue_sqe(req, NULL); - mutex_unlock(&ctx->uring_lock); - } else { + else __io_req_task_cancel(req, -EFAULT); - } + mutex_unlock(&ctx->uring_lock); } static void io_req_task_submit(struct callback_head *cb) From 6c503150ae33ee19036255cfda0998463613352c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 4 Jan 2021 20:36:36 +0000 Subject: [PATCH 339/547] io_uring: patch up IOPOLL overflow_flush sync IOPOLL skips completion locking but keeps it under uring_lock, thus io_cqring_overflow_flush() and so io_cqring_events() need additional locking with uring_lock in some cases for IOPOLL. Remove __io_cqring_overflow_flush() from io_cqring_events(), introduce a wrapper around flush doing needed synchronisation and call it by hand. Cc: stable@vger.kernel.org # 5.5+ Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 78 +++++++++++++++++++++++++++------------------------ 1 file changed, 41 insertions(+), 37 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5be33fd8b6bc..445035b24a50 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1713,9 +1713,9 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) } /* Returns true if there are no backlogged entries after the flush */ -static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, - struct task_struct *tsk, - struct files_struct *files) +static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, + struct task_struct *tsk, + struct files_struct *files) { struct io_rings *rings = ctx->rings; struct io_kiocb *req, *tmp; @@ -1768,6 +1768,20 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, return all_flushed; } +static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, + struct task_struct *tsk, + struct files_struct *files) +{ + if (test_bit(0, &ctx->cq_check_overflow)) { + /* iopoll syncs against uring_lock, not completion_lock */ + if (ctx->flags & IORING_SETUP_IOPOLL) + mutex_lock(&ctx->uring_lock); + __io_cqring_overflow_flush(ctx, force, tsk, files); + if (ctx->flags & IORING_SETUP_IOPOLL) + mutex_unlock(&ctx->uring_lock); + } +} + static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) { struct io_ring_ctx *ctx = req->ctx; @@ -2314,20 +2328,8 @@ static void io_double_put_req(struct io_kiocb *req) io_free_req(req); } -static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush) +static unsigned io_cqring_events(struct io_ring_ctx *ctx) { - if (test_bit(0, &ctx->cq_check_overflow)) { - /* - * noflush == true is from the waitqueue handler, just ensure - * we wake up the task, and the next invocation will flush the - * entries. We cannot safely to it from here. - */ - if (noflush) - return -1U; - - io_cqring_overflow_flush(ctx, false, NULL, NULL); - } - /* See comment at the top of this file */ smp_rmb(); return __io_cqring_events(ctx); @@ -2552,7 +2554,9 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) * If we do, we can potentially be spinning for commands that * already triggered a CQE (eg in error). */ - if (io_cqring_events(ctx, false)) + if (test_bit(0, &ctx->cq_check_overflow)) + __io_cqring_overflow_flush(ctx, false, NULL, NULL); + if (io_cqring_events(ctx)) break; /* @@ -6827,7 +6831,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) /* if we have a backlog and couldn't flush it all, return BUSY */ if (test_bit(0, &ctx->sq_check_overflow)) { - if (!io_cqring_overflow_flush(ctx, false, NULL, NULL)) + if (!__io_cqring_overflow_flush(ctx, false, NULL, NULL)) return -EBUSY; } @@ -7090,7 +7094,7 @@ struct io_wait_queue { unsigned nr_timeouts; }; -static inline bool io_should_wake(struct io_wait_queue *iowq, bool noflush) +static inline bool io_should_wake(struct io_wait_queue *iowq) { struct io_ring_ctx *ctx = iowq->ctx; @@ -7099,7 +7103,7 @@ static inline bool io_should_wake(struct io_wait_queue *iowq, bool noflush) * started waiting. For timeouts, we always want to return to userspace, * regardless of event count. */ - return io_cqring_events(ctx, noflush) >= iowq->to_wait || + return io_cqring_events(ctx) >= iowq->to_wait || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; } @@ -7109,11 +7113,13 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, wq); - /* use noflush == true, as we can't safely rely on locking context */ - if (!io_should_wake(iowq, true)) - return -1; - - return autoremove_wake_function(curr, mode, wake_flags, key); + /* + * Cannot safely flush overflowed CQEs from here, ensure we wake up + * the task, and the next invocation will do it. + */ + if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->cq_check_overflow)) + return autoremove_wake_function(curr, mode, wake_flags, key); + return -1; } static int io_run_task_work_sig(void) @@ -7150,7 +7156,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, int ret = 0; do { - if (io_cqring_events(ctx, false) >= min_events) + io_cqring_overflow_flush(ctx, false, NULL, NULL); + if (io_cqring_events(ctx) >= min_events) return 0; if (!io_run_task_work()) break; @@ -7178,6 +7185,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); trace_io_uring_cqring_wait(ctx, min_events); do { + io_cqring_overflow_flush(ctx, false, NULL, NULL); prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, TASK_INTERRUPTIBLE); /* make sure we run task_work before checking for signals */ @@ -7186,8 +7194,10 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, continue; else if (ret < 0) break; - if (io_should_wake(&iowq, false)) + if (io_should_wake(&iowq)) break; + if (test_bit(0, &ctx->cq_check_overflow)) + continue; if (uts) { timeout = schedule_timeout(timeout); if (timeout == 0) { @@ -8625,7 +8635,8 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) smp_rmb(); if (!io_sqring_full(ctx)) mask |= EPOLLOUT | EPOLLWRNORM; - if (io_cqring_events(ctx, false)) + io_cqring_overflow_flush(ctx, false, NULL, NULL); + if (io_cqring_events(ctx)) mask |= EPOLLIN | EPOLLRDNORM; return mask; @@ -8683,7 +8694,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) /* if force is set, the ring is going away. always drop after that */ ctx->cq_overflow_flushed = 1; if (ctx->rings) - io_cqring_overflow_flush(ctx, true, NULL, NULL); + __io_cqring_overflow_flush(ctx, true, NULL, NULL); mutex_unlock(&ctx->uring_lock); io_kill_timeouts(ctx, NULL, NULL); @@ -8857,9 +8868,7 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, } io_cancel_defer_files(ctx, task, files); - io_ring_submit_lock(ctx, (ctx->flags & IORING_SETUP_IOPOLL)); io_cqring_overflow_flush(ctx, true, task, files); - io_ring_submit_unlock(ctx, (ctx->flags & IORING_SETUP_IOPOLL)); if (!files) __io_uring_cancel_task_requests(ctx, task); @@ -9195,13 +9204,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, */ ret = 0; if (ctx->flags & IORING_SETUP_SQPOLL) { - if (!list_empty_careful(&ctx->cq_overflow_list)) { - bool needs_lock = ctx->flags & IORING_SETUP_IOPOLL; + io_cqring_overflow_flush(ctx, false, NULL, NULL); - io_ring_submit_lock(ctx, needs_lock); - io_cqring_overflow_flush(ctx, false, NULL, NULL); - io_ring_submit_unlock(ctx, needs_lock); - } if (flags & IORING_ENTER_SQ_WAKEUP) wake_up(&ctx->sq_data->wait); if (flags & IORING_ENTER_SQ_WAIT) From de7f1d9e99d8b99e4e494ad8fcd91f0c4c5c9357 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 4 Jan 2021 20:43:29 +0000 Subject: [PATCH 340/547] io_uring: drop file refs after task cancel io_uring fds marked O_CLOEXEC and we explicitly cancel all requests before going through exec, so we don't want to leave task's file references to not our anymore io_uring instances. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 445035b24a50..85de42c42433 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8958,6 +8958,15 @@ static void io_uring_attempt_task_drop(struct file *file) io_uring_del_task_file(file); } +static void io_uring_remove_task_files(struct io_uring_task *tctx) +{ + struct file *file; + unsigned long index; + + xa_for_each(&tctx->xa, index, file) + io_uring_del_task_file(file); +} + void __io_uring_files_cancel(struct files_struct *files) { struct io_uring_task *tctx = current->io_uring; @@ -8966,16 +8975,12 @@ void __io_uring_files_cancel(struct files_struct *files) /* make sure overflow events are dropped */ atomic_inc(&tctx->in_idle); - - xa_for_each(&tctx->xa, index, file) { - struct io_ring_ctx *ctx = file->private_data; - - io_uring_cancel_task_requests(ctx, files); - if (files) - io_uring_del_task_file(file); - } - + xa_for_each(&tctx->xa, index, file) + io_uring_cancel_task_requests(file->private_data, files); atomic_dec(&tctx->in_idle); + + if (files) + io_uring_remove_task_files(tctx); } static s64 tctx_inflight(struct io_uring_task *tctx) @@ -9038,6 +9043,8 @@ void __io_uring_task_cancel(void) } while (1); atomic_dec(&tctx->in_idle); + + io_uring_remove_task_files(tctx); } static int io_uring_flush(struct file *file, void *data) From 90df08538c07b7135703358a0c8c08d97889a704 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 4 Jan 2021 20:43:30 +0000 Subject: [PATCH 341/547] io_uring: cancel more aggressively in exit_work While io_ring_exit_work() is running new requests of all sorts may be issued, so it should do a bit more to cancel them, otherwise they may just get stuck. e.g. in io-wq, in poll lists, etc. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 85de42c42433..5bccb235271f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -992,6 +992,9 @@ enum io_mem_account { ACCT_PINNED, }; +static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx, + struct task_struct *task); + static void destroy_fixed_file_ref_node(struct fixed_file_ref_node *ref_node); static struct fixed_file_ref_node *alloc_fixed_file_ref_node( struct io_ring_ctx *ctx); @@ -8675,7 +8678,7 @@ static void io_ring_exit_work(struct work_struct *work) * as nobody else will be looking for them. */ do { - io_iopoll_try_reap_events(ctx); + __io_uring_cancel_task_requests(ctx, NULL); } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)); io_ring_ctx_free(ctx); } @@ -8830,9 +8833,11 @@ static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx, enum io_wq_cancel cret; bool ret = false; - cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, &cancel, true); - if (cret != IO_WQ_CANCEL_NOTFOUND) - ret = true; + if (ctx->io_wq) { + cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, + &cancel, true); + ret |= (cret != IO_WQ_CANCEL_NOTFOUND); + } /* SQPOLL thread does its own polling */ if (!(ctx->flags & IORING_SETUP_SQPOLL)) { From 75353bcd2184010f08a3ed2f0da019bd9d604e1e Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Thu, 24 Dec 2020 15:13:57 +0000 Subject: [PATCH 342/547] drm/i915: clear the shadow batch The shadow batch is an internal object, which doesn't have any page clearing, and since the batch_len can be smaller than the object, we should take care to clear it. Testcase: igt/gen9_exec_parse/shadow-peek Fixes: 4f7af1948abc ("drm/i915: Support ro ppgtt mapped cmdparser shadow buffers") Signed-off-by: Matthew Auld Reviewed-by: Chris Wilson Signed-off-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20201224151358.401345-1-matthew.auld@intel.com Cc: stable@vger.kernel.org (cherry picked from commit eeb52ee6c4a429ec301faf1dc48988744960786e) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_cmd_parser.c | 27 +++++++++----------------- 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c index 93265951fdbb..b0899b665e85 100644 --- a/drivers/gpu/drm/i915/i915_cmd_parser.c +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c @@ -1166,7 +1166,7 @@ static u32 *copy_batch(struct drm_i915_gem_object *dst_obj, } } if (IS_ERR(src)) { - unsigned long x, n; + unsigned long x, n, remain; void *ptr; /* @@ -1177,14 +1177,15 @@ static u32 *copy_batch(struct drm_i915_gem_object *dst_obj, * We don't care about copying too much here as we only * validate up to the end of the batch. */ + remain = length; if (!(dst_obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ)) - length = round_up(length, + remain = round_up(remain, boot_cpu_data.x86_clflush_size); ptr = dst; x = offset_in_page(offset); - for (n = offset >> PAGE_SHIFT; length; n++) { - int len = min(length, PAGE_SIZE - x); + for (n = offset >> PAGE_SHIFT; remain; n++) { + int len = min(remain, PAGE_SIZE - x); src = kmap_atomic(i915_gem_object_get_page(src_obj, n)); if (needs_clflush) @@ -1193,13 +1194,15 @@ static u32 *copy_batch(struct drm_i915_gem_object *dst_obj, kunmap_atomic(src); ptr += len; - length -= len; + remain -= len; x = 0; } } i915_gem_object_unpin_pages(src_obj); + memset32(dst + length, 0, (dst_obj->base.size - length) / sizeof(u32)); + /* dst_obj is returned with vmap pinned */ return dst; } @@ -1392,11 +1395,6 @@ static unsigned long *alloc_whitelist(u32 batch_length) #define LENGTH_BIAS 2 -static bool shadow_needs_clflush(struct drm_i915_gem_object *obj) -{ - return !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE); -} - /** * intel_engine_cmd_parser() - parse a batch buffer for privilege violations * @engine: the engine on which the batch is to execute @@ -1538,16 +1536,9 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine, ret = 0; /* allow execution */ } } - - if (shadow_needs_clflush(shadow->obj)) - drm_clflush_virt_range(batch_end, 8); } - if (shadow_needs_clflush(shadow->obj)) { - void *ptr = page_mask_bits(shadow->obj->mm.mapping); - - drm_clflush_virt_range(ptr, (void *)(cmd + 1) - ptr); - } + i915_gem_object_flush_map(shadow->obj); if (!IS_ERR_OR_NULL(jump_whitelist)) kfree(jump_whitelist); From 641382e9b44fba81a0778e1914ee35b8471121f9 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Thu, 24 Dec 2020 15:13:58 +0000 Subject: [PATCH 343/547] drm/i915: clear the gpu reloc batch The reloc batch is short lived but can exist in the user visible ppGTT, and since it's backed by an internal object, which lacks page clearing, we should take care to clear it upfront. Signed-off-by: Matthew Auld Reviewed-by: Chris Wilson Signed-off-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20201224151358.401345-2-matthew.auld@intel.com Cc: stable@vger.kernel.org (cherry picked from commit 26ebc511e799f621357982ccc37a7987a56a00f4) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index bcc80f428172..bd3046e5a934 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -1046,7 +1046,7 @@ static void reloc_gpu_flush(struct i915_execbuffer *eb, struct reloc_cache *cach GEM_BUG_ON(cache->rq_size >= obj->base.size / sizeof(u32)); cache->rq_cmd[cache->rq_size] = MI_BATCH_BUFFER_END; - __i915_gem_object_flush_map(obj, 0, sizeof(u32) * (cache->rq_size + 1)); + i915_gem_object_flush_map(obj); i915_gem_object_unpin_map(obj); intel_gt_chipset_flush(cache->rq->engine->gt); @@ -1296,6 +1296,8 @@ static int __reloc_gpu_alloc(struct i915_execbuffer *eb, goto err_pool; } + memset32(cmd, 0, pool->obj->base.size / sizeof(u32)); + batch = i915_vma_instance(pool->obj, vma->vm, NULL); if (IS_ERR(batch)) { err = PTR_ERR(batch); From 557862535c2cad6de6f6fb12312b7a6d09c06407 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 29 Dec 2020 12:08:28 +0000 Subject: [PATCH 344/547] drm/i915/gt: Define guc firmware blob for older Cometlakes When splitting the Coffeelake define to also identify Cometlakes, I missed the double fw_def for Coffeelake. That is only newer Cometlakes use the cml specific guc firmware, older Cometlakes should use kbl firmware. Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/2859 Fixes: 5f4ae2704d59 ("drm/i915: Identify Cometlake platform") Signed-off-by: Chris Wilson Cc: # v5.9+ Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20201229120828.29931-1-chris@chris-wilson.co.uk (cherry picked from commit 70960ab27542d8dc322f909f516391f331fbd3f1) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c index 180c23e2e25e..602f1a0bc587 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c @@ -53,6 +53,7 @@ void intel_uc_fw_change_status(struct intel_uc_fw *uc_fw, fw_def(ELKHARTLAKE, 0, guc_def(ehl, 49, 0, 1), huc_def(ehl, 9, 0, 0)) \ fw_def(ICELAKE, 0, guc_def(icl, 49, 0, 1), huc_def(icl, 9, 0, 0)) \ fw_def(COMETLAKE, 5, guc_def(cml, 49, 0, 1), huc_def(cml, 4, 0, 0)) \ + fw_def(COMETLAKE, 0, guc_def(kbl, 49, 0, 1), huc_def(kbl, 4, 0, 0)) \ fw_def(COFFEELAKE, 0, guc_def(kbl, 49, 0, 1), huc_def(kbl, 4, 0, 0)) \ fw_def(GEMINILAKE, 0, guc_def(glk, 49, 0, 1), huc_def(glk, 4, 0, 0)) \ fw_def(KABYLAKE, 0, guc_def(kbl, 49, 0, 1), huc_def(kbl, 4, 0, 0)) \ From 9397d66212cdf7a21c66523f1583e5d63a609e84 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 30 Dec 2020 20:23:09 +0000 Subject: [PATCH 345/547] drm/i915/dp: Track pm_qos per connector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since multiple connectors may run intel_dp_aux_xfer conncurrently, a single global pm_qos does not suffice. (One connector may disable the dma-latency boost prematurely while the second is still depending on it.) Instead of a single global pm_qos, track the pm_qos request for each intel_dp. v2: Move the pm_qos setup/teardown to intel_dp_aux_init/fini Fixes: 9ee32fea5fe8 ("drm/i915: irq-drive the dp aux communication") Signed-off-by: Chris Wilson Cc: Ville Syrjälä Cc: Imre Deak Reviewed-by: Imre Deak Link: https://patchwork.freedesktop.org/patch/msgid/20201230202309.23982-1-chris@chris-wilson.co.uk (cherry picked from commit b3304591f14b437b6bccd8dbff06006c11837031) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/display/intel_display_types.h | 3 +++ drivers/gpu/drm/i915/display/intel_dp.c | 8 ++++++-- drivers/gpu/drm/i915/i915_drv.c | 5 ----- drivers/gpu/drm/i915/i915_drv.h | 3 --- 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_display_types.h b/drivers/gpu/drm/i915/display/intel_display_types.h index ce82d654d0f2..34d78c654df3 100644 --- a/drivers/gpu/drm/i915/display/intel_display_types.h +++ b/drivers/gpu/drm/i915/display/intel_display_types.h @@ -1436,6 +1436,9 @@ struct intel_dp { bool ycbcr_444_to_420; } dfp; + /* To control wakeup latency, e.g. for irq-driven dp aux transfers. */ + struct pm_qos_request pm_qos; + /* Display stream compression testing */ bool force_dsc_en; diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c index 2165398d2c7c..37f1a10fd021 100644 --- a/drivers/gpu/drm/i915/display/intel_dp.c +++ b/drivers/gpu/drm/i915/display/intel_dp.c @@ -1489,7 +1489,7 @@ intel_dp_aux_xfer(struct intel_dp *intel_dp, * lowest possible wakeup latency and so prevent the cpu from going into * deep sleep states. */ - cpu_latency_qos_update_request(&i915->pm_qos, 0); + cpu_latency_qos_update_request(&intel_dp->pm_qos, 0); intel_dp_check_edp(intel_dp); @@ -1622,7 +1622,7 @@ done: ret = recv_bytes; out: - cpu_latency_qos_update_request(&i915->pm_qos, PM_QOS_DEFAULT_VALUE); + cpu_latency_qos_update_request(&intel_dp->pm_qos, PM_QOS_DEFAULT_VALUE); if (vdd) edp_panel_vdd_off(intel_dp, false); @@ -1898,6 +1898,9 @@ static i915_reg_t tgl_aux_data_reg(struct intel_dp *intel_dp, int index) static void intel_dp_aux_fini(struct intel_dp *intel_dp) { + if (cpu_latency_qos_request_active(&intel_dp->pm_qos)) + cpu_latency_qos_remove_request(&intel_dp->pm_qos); + kfree(intel_dp->aux.name); } @@ -1950,6 +1953,7 @@ intel_dp_aux_init(struct intel_dp *intel_dp) encoder->base.name); intel_dp->aux.transfer = intel_dp_aux_transfer; + cpu_latency_qos_add_request(&intel_dp->pm_qos, PM_QOS_DEFAULT_VALUE); } bool intel_dp_source_supports_hbr2(struct intel_dp *intel_dp) diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index 320856b665a1..88ad754962af 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -578,8 +578,6 @@ static int i915_driver_hw_probe(struct drm_i915_private *dev_priv) pci_set_master(pdev); - cpu_latency_qos_add_request(&dev_priv->pm_qos, PM_QOS_DEFAULT_VALUE); - intel_gt_init_workarounds(dev_priv); /* On the 945G/GM, the chipset reports the MSI capability on the @@ -626,7 +624,6 @@ static int i915_driver_hw_probe(struct drm_i915_private *dev_priv) err_msi: if (pdev->msi_enabled) pci_disable_msi(pdev); - cpu_latency_qos_remove_request(&dev_priv->pm_qos); err_mem_regions: intel_memory_regions_driver_release(dev_priv); err_ggtt: @@ -648,8 +645,6 @@ static void i915_driver_hw_remove(struct drm_i915_private *dev_priv) if (pdev->msi_enabled) pci_disable_msi(pdev); - - cpu_latency_qos_remove_request(&dev_priv->pm_qos); } /** diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 0a3ee4f9dc0a..632c713227dc 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -891,9 +891,6 @@ struct drm_i915_private { bool display_irqs_enabled; - /* To control wakeup latency, e.g. for irq-driven dp aux transfers. */ - struct pm_qos_request pm_qos; - /* Sideband mailbox protection */ struct mutex sb_lock; struct pm_qos_request sb_qos; From 05f6f7271a38c482c5021967433f7b698e102c45 Mon Sep 17 00:00:00 2001 From: Qii Wang Date: Thu, 24 Dec 2020 20:26:07 +0800 Subject: [PATCH 346/547] i2c: mediatek: Fix apdma and i2c hand-shake timeout With the apdma remove hand-shake signal, it requirs special operation timing to reset i2c manually, otherwise the interrupt will not be triggered, i2c transmission will be timeout. Fixes: 8426fe70cfa4("i2c: mediatek: Add apdma sync in i2c driver") Signed-off-by: Qii Wang Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-mt65xx.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/drivers/i2c/busses/i2c-mt65xx.c b/drivers/i2c/busses/i2c-mt65xx.c index 33de99b7bc20..0818d3e50734 100644 --- a/drivers/i2c/busses/i2c-mt65xx.c +++ b/drivers/i2c/busses/i2c-mt65xx.c @@ -38,6 +38,7 @@ #define I2C_IO_CONFIG_OPEN_DRAIN 0x0003 #define I2C_IO_CONFIG_PUSH_PULL 0x0000 #define I2C_SOFT_RST 0x0001 +#define I2C_HANDSHAKE_RST 0x0020 #define I2C_FIFO_ADDR_CLR 0x0001 #define I2C_DELAY_LEN 0x0002 #define I2C_TIME_CLR_VALUE 0x0000 @@ -45,6 +46,7 @@ #define I2C_WRRD_TRANAC_VALUE 0x0002 #define I2C_RD_TRANAC_VALUE 0x0001 #define I2C_SCL_MIS_COMP_VALUE 0x0000 +#define I2C_CHN_CLR_FLAG 0x0000 #define I2C_DMA_CON_TX 0x0000 #define I2C_DMA_CON_RX 0x0001 @@ -54,7 +56,9 @@ #define I2C_DMA_START_EN 0x0001 #define I2C_DMA_INT_FLAG_NONE 0x0000 #define I2C_DMA_CLR_FLAG 0x0000 +#define I2C_DMA_WARM_RST 0x0001 #define I2C_DMA_HARD_RST 0x0002 +#define I2C_DMA_HANDSHAKE_RST 0x0004 #define MAX_SAMPLE_CNT_DIV 8 #define MAX_STEP_CNT_DIV 64 @@ -475,11 +479,24 @@ static void mtk_i2c_init_hw(struct mtk_i2c *i2c) { u16 control_reg; - writel(I2C_DMA_HARD_RST, i2c->pdmabase + OFFSET_RST); - udelay(50); - writel(I2C_DMA_CLR_FLAG, i2c->pdmabase + OFFSET_RST); - - mtk_i2c_writew(i2c, I2C_SOFT_RST, OFFSET_SOFTRESET); + if (i2c->dev_comp->dma_sync) { + writel(I2C_DMA_WARM_RST, i2c->pdmabase + OFFSET_RST); + udelay(10); + writel(I2C_DMA_CLR_FLAG, i2c->pdmabase + OFFSET_RST); + udelay(10); + writel(I2C_DMA_HANDSHAKE_RST | I2C_DMA_HARD_RST, + i2c->pdmabase + OFFSET_RST); + mtk_i2c_writew(i2c, I2C_HANDSHAKE_RST | I2C_SOFT_RST, + OFFSET_SOFTRESET); + udelay(10); + writel(I2C_DMA_CLR_FLAG, i2c->pdmabase + OFFSET_RST); + mtk_i2c_writew(i2c, I2C_CHN_CLR_FLAG, OFFSET_SOFTRESET); + } else { + writel(I2C_DMA_HARD_RST, i2c->pdmabase + OFFSET_RST); + udelay(50); + writel(I2C_DMA_CLR_FLAG, i2c->pdmabase + OFFSET_RST); + mtk_i2c_writew(i2c, I2C_SOFT_RST, OFFSET_SOFTRESET); + } /* Set ioconfig */ if (i2c->use_push_pull) From d1c5246e08eb64991001d97a3bd119c93edbc79a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 2 Dec 2020 22:28:12 -0800 Subject: [PATCH 347/547] x86/mm: Fix leak of pmd ptlock Commit 28ee90fe6048 ("x86/mm: implement free pmd/pte page interfaces") introduced a new location where a pmd was released, but neglected to run the pmd page destructor. In fact, this happened previously for a different pmd release path and was fixed by commit: c283610e44ec ("x86, mm: do not leak page->ptl for pmd page tables"). This issue was hidden until recently because the failure mode is silent, but commit: b2b29d6d0119 ("mm: account PMD tables like PTE tables") turns the failure mode into this signature: BUG: Bad page state in process lt-pmem-ns pfn:15943d page:000000007262ed7b refcount:0 mapcount:-1024 mapping:0000000000000000 index:0x0 pfn:0x15943d flags: 0xaffff800000000() raw: 00affff800000000 dead000000000100 0000000000000000 0000000000000000 raw: 0000000000000000 ffff913a029bcc08 00000000fffffbff 0000000000000000 page dumped because: nonzero mapcount [..] dump_stack+0x8b/0xb0 bad_page.cold+0x63/0x94 free_pcp_prepare+0x224/0x270 free_unref_page+0x18/0xd0 pud_free_pmd_page+0x146/0x160 ioremap_pud_range+0xe3/0x350 ioremap_page_range+0x108/0x160 __ioremap_caller.constprop.0+0x174/0x2b0 ? memremap+0x7a/0x110 memremap+0x7a/0x110 devm_memremap+0x53/0xa0 pmem_attach_disk+0x4ed/0x530 [nd_pmem] ? __devm_release_region+0x52/0x80 nvdimm_bus_probe+0x85/0x210 [libnvdimm] Given this is a repeat occurrence it seemed prudent to look for other places where this destructor might be missing and whether a better helper is needed. try_to_free_pmd_page() looks like a candidate, but testing with setting up and tearing down pmd mappings via the dax unit tests is thus far not triggering the failure. As for a better helper pmd_free() is close, but it is a messy fit due to requiring an @mm arg. Also, ___pmd_free_tlb() wants to call paravirt_tlb_remove_table() instead of free_page(), so open-coded pgtable_pmd_page_dtor() seems the best way forward for now. Debugged together with Matthew Wilcox . Fixes: 28ee90fe6048 ("x86/mm: implement free pmd/pte page interfaces") Signed-off-by: Dan Williams Signed-off-by: Borislav Petkov Tested-by: Yi Zhang Acked-by: Peter Zijlstra (Intel) Cc: Link: https://lkml.kernel.org/r/160697689204.605323.17629854984697045602.stgit@dwillia2-desk3.amr.corp.intel.com --- arch/x86/mm/pgtable.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index dfd82f51ba66..f6a9e2e36642 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -829,6 +829,8 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr) } free_page((unsigned long)pmd_sv); + + pgtable_pmd_page_dtor(virt_to_page(pmd)); free_page((unsigned long)pmd); return 1; From 311bea3cb9ee20ef150ca76fc60a592bf6b159f5 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Thu, 17 Dec 2020 16:24:32 -0800 Subject: [PATCH 348/547] arm64: link with -z norelro for LLD or aarch64-elf MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With GNU binutils 2.35+, linking with BFD produces warnings for vmlinux: aarch64-linux-gnu-ld: warning: -z norelro ignored BFD can produce this warning when the target emulation mode does not support RELRO program headers, and -z relro or -z norelro is passed. Alan Modra clarifies: The default linker emulation for an aarch64-linux ld.bfd is -maarch64linux, the default for an aarch64-elf linker is -maarch64elf. They are not equivalent. If you choose -maarch64elf you get an emulation that doesn't support -z relro. The ARCH=arm64 kernel prefers -maarch64elf, but may fall back to -maarch64linux based on the toolchain configuration. LLD will always create RELRO program header regardless of target emulation. To avoid the above warning when linking with BFD, pass -z norelro only when linking with LLD or with -maarch64linux. Fixes: 3b92fa7485eb ("arm64: link with -z norelro regardless of CONFIG_RELOCATABLE") Fixes: 3bbd3db86470 ("arm64: relocatable: fix inconsistencies in linker script and options") Cc: # 5.0.x- Reported-by: kernelci.org bot Reported-by: Quentin Perret Signed-off-by: Nick Desaulniers Reviewed-by: Nathan Chancellor Acked-by: Ard Biesheuvel Cc: Alan Modra Cc: Fāng-ruì Sòng Link: https://lore.kernel.org/r/20201218002432.788499-1-ndesaulniers@google.com Signed-off-by: Catalin Marinas --- arch/arm64/Makefile | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 6be9b3750250..90309208bb28 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -10,7 +10,7 @@ # # Copyright (C) 1995-2001 by Russell King -LDFLAGS_vmlinux :=--no-undefined -X -z norelro +LDFLAGS_vmlinux :=--no-undefined -X ifeq ($(CONFIG_RELOCATABLE), y) # Pass --no-apply-dynamic-relocs to restore pre-binutils-2.27 behaviour @@ -115,16 +115,20 @@ KBUILD_CPPFLAGS += -mbig-endian CHECKFLAGS += -D__AARCH64EB__ # Prefer the baremetal ELF build target, but not all toolchains include # it so fall back to the standard linux version if needed. -KBUILD_LDFLAGS += -EB $(call ld-option, -maarch64elfb, -maarch64linuxb) +KBUILD_LDFLAGS += -EB $(call ld-option, -maarch64elfb, -maarch64linuxb -z norelro) UTS_MACHINE := aarch64_be else KBUILD_CPPFLAGS += -mlittle-endian CHECKFLAGS += -D__AARCH64EL__ # Same as above, prefer ELF but fall back to linux target if needed. -KBUILD_LDFLAGS += -EL $(call ld-option, -maarch64elf, -maarch64linux) +KBUILD_LDFLAGS += -EL $(call ld-option, -maarch64elf, -maarch64linux -z norelro) UTS_MACHINE := aarch64 endif +ifeq ($(CONFIG_LD_IS_LLD), y) +KBUILD_LDFLAGS += -z norelro +endif + CHECKFLAGS += -D__aarch64__ ifeq ($(CONFIG_DYNAMIC_FTRACE_WITH_REGS),y) From 96ebc9c871d8a28fb22aa758dd9188a4732df482 Mon Sep 17 00:00:00 2001 From: Thinh Nguyen Date: Mon, 4 Jan 2021 20:07:15 -0800 Subject: [PATCH 349/547] usb: uas: Add PNY USB Portable SSD to unusual_uas Here's another variant PNY Pro Elite USB 3.1 Gen 2 portable SSD that hangs and doesn't respond to ATA_1x pass-through commands. If it doesn't support these commands, it should respond properly to the host. Add it to the unusual uas list to be able to move forward with other operations. Cc: stable@vger.kernel.org Reviewed-by: Hans de Goede Acked-by: Oliver Neukum Signed-off-by: Thinh Nguyen Link: https://lore.kernel.org/r/2edc7af892d0913bf06f5b35e49ec463f03d5ed8.1609819418.git.Thinh.Nguyen@synopsys.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/storage/unusual_uas.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/usb/storage/unusual_uas.h b/drivers/usb/storage/unusual_uas.h index 870e9cf3d5dc..f9677a5ec31b 100644 --- a/drivers/usb/storage/unusual_uas.h +++ b/drivers/usb/storage/unusual_uas.h @@ -90,6 +90,13 @@ UNUSUAL_DEV(0x152d, 0x0578, 0x0000, 0x9999, USB_SC_DEVICE, USB_PR_DEVICE, NULL, US_FL_BROKEN_FUA), +/* Reported-by: Thinh Nguyen */ +UNUSUAL_DEV(0x154b, 0xf00b, 0x0000, 0x9999, + "PNY", + "Pro Elite SSD", + USB_SC_DEVICE, USB_PR_DEVICE, NULL, + US_FL_NO_ATA_1X), + /* Reported-by: Thinh Nguyen */ UNUSUAL_DEV(0x154b, 0xf00d, 0x0000, 0x9999, "PNY", From 45ba7b195a369f35cb39094fdb32efe5908b34ad Mon Sep 17 00:00:00 2001 From: Shannon Zhao Date: Mon, 4 Jan 2021 19:38:44 +0800 Subject: [PATCH 350/547] arm64: cpufeature: remove non-exist CONFIG_KVM_ARM_HOST Commit d82755b2e781 ("KVM: arm64: Kill off CONFIG_KVM_ARM_HOST") deletes CONFIG_KVM_ARM_HOST option, it should use CONFIG_KVM instead. Just remove CONFIG_KVM_ARM_HOST here. Fixes: d82755b2e781 ("KVM: arm64: Kill off CONFIG_KVM_ARM_HOST") Signed-off-by: Shannon Zhao Acked-by: Catalin Marinas Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/1609760324-92271-1-git-send-email-shannon.zhao@linux.alibaba.com --- arch/arm64/kernel/cpufeature.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index d96f4554282d..bc3549663957 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2558,7 +2558,7 @@ static void verify_hyp_capabilities(void) int parange, ipa_max; unsigned int safe_vmid_bits, vmid_bits; - if (!IS_ENABLED(CONFIG_KVM) || !IS_ENABLED(CONFIG_KVM_ARM_HOST)) + if (!IS_ENABLED(CONFIG_KVM)) return; safe_mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); From c9c48bb701ba78df7d4652146b12bcf3ad716507 Mon Sep 17 00:00:00 2001 From: Samuel Thibault Date: Tue, 22 Dec 2020 02:47:56 +0100 Subject: [PATCH 351/547] speakup: Add github repository URL and bug tracker We have set up a repository for users to try newer releases more easily, and keep records of known bugs. Signed-off-by: Samuel Thibault Link: https://lore.kernel.org/r/20201222014756.ov5vi6fywylbp5n6@function Signed-off-by: Greg Kroah-Hartman --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 546aa66428c9..62934e770646 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16710,6 +16710,8 @@ M: Samuel Thibault L: speakup@linux-speakup.org S: Odd Fixes W: http://www.linux-speakup.org/ +W: https://github.com/linux-speakup/speakup +B: https://github.com/linux-speakup/speakup/issues F: drivers/accessibility/speakup/ SPEAR CLOCK FRAMEWORK SUPPORT From f6bcb4c7f366905b66ce8ffca7190118244bb642 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 5 Jan 2021 14:42:29 +0300 Subject: [PATCH 352/547] regmap: debugfs: Fix a reversed if statement in regmap_debugfs_init() This code will leak "map->debugfs_name" because the if statement is reversed so it only frees NULL pointers instead of non-NULL. In fact the if statement is not required and should just be removed because kfree() accepts NULL pointers. Fixes: cffa4b2122f5 ("regmap: debugfs: Fix a memory leak when calling regmap_attach_dev") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/X/RQpfAwRdLg0GqQ@mwanda Signed-off-by: Mark Brown --- drivers/base/regmap/regmap-debugfs.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/base/regmap/regmap-debugfs.c b/drivers/base/regmap/regmap-debugfs.c index bf03cd343be2..ff2ee87987c7 100644 --- a/drivers/base/regmap/regmap-debugfs.c +++ b/drivers/base/regmap/regmap-debugfs.c @@ -594,9 +594,7 @@ void regmap_debugfs_init(struct regmap *map) } if (!strcmp(name, "dummy")) { - if (!map->debugfs_name) - kfree(map->debugfs_name); - + kfree(map->debugfs_name); map->debugfs_name = kasprintf(GFP_KERNEL, "dummy%d", dummy_index); if (!map->debugfs_name) From 3fb6819f411b5a89afb5726afafacf0c4b62844f Mon Sep 17 00:00:00 2001 From: Tian Tao Date: Mon, 28 Dec 2020 15:05:08 +0800 Subject: [PATCH 353/547] arm64: traps: remove duplicate include statement asm/exception.h is included more than once. Remove the one that isn't necessary. Signed-off-by: Tian Tao Link: https://lore.kernel.org/r/1609139108-10819-1-git-send-email-tiantao6@hisilicon.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/traps.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 08156be75569..6895ce777e7f 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -42,7 +42,6 @@ #include #include #include -#include #include #include From e2bba5f92354488c331b7821d873db7c388e31aa Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Wed, 30 Dec 2020 14:19:54 -0800 Subject: [PATCH 354/547] arm64: vdso: disable .eh_frame_hdr via /DISCARD/ instead of --no-eh-frame-hdr Currently with ld.lld we emit an empty .eh_frame_hdr section (and a corresponding program header) into the vDSO. With ld.bfd the section is not emitted but the program header is, with p_vaddr set to 0. This can lead to unwinders attempting to interpret the data at whichever location the program header happens to point to as an unwind info header. This happens to be mostly harmless as long as the byte at that location (interpreted as a version number) has a value other than 1, causing both libgcc and LLVM libunwind to ignore the section (in libunwind's case, after printing an error message to stderr), but it could lead to worse problems if the byte happened to be 1 or the program header points to non-readable memory (e.g. if the empty section was placed at a page boundary). Instead of disabling .eh_frame_hdr via --no-eh-frame-hdr (which also has the downside of being unsupported by older versions of GNU binutils), disable it by discarding the section, and stop emitting the program header that points to it. I understand that we intend to emit valid unwind info for the vDSO at some point. Once that happens this patch can be reverted. Signed-off-by: Peter Collingbourne Acked-by: Ard Biesheuvel Acked-by: Vincenzo Frascino Link: https://linux-review.googlesource.com/id/If745fd9cadcb31b4010acbf5693727fe111b0863 Link: https://lore.kernel.org/r/20201230221954.2007257-1-pcc@google.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/vdso/Makefile | 3 +-- arch/arm64/kernel/vdso/vdso.lds.S | 5 +---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile index a8f8e409e2bf..cd9c3fa25902 100644 --- a/arch/arm64/kernel/vdso/Makefile +++ b/arch/arm64/kernel/vdso/Makefile @@ -24,8 +24,7 @@ btildflags-$(CONFIG_ARM64_BTI_KERNEL) += -z force-bti # routines, as x86 does (see 6f121e548f83 ("x86, vdso: Reimplement vdso.so # preparation in build-time C")). ldflags-y := -shared -nostdlib -soname=linux-vdso.so.1 --hash-style=sysv \ - -Bsymbolic $(call ld-option, --no-eh-frame-hdr) --build-id=sha1 -n \ - $(btildflags-y) -T + -Bsymbolic --build-id=sha1 -n $(btildflags-y) -T ccflags-y := -fno-common -fno-builtin -fno-stack-protector -ffixed-x18 ccflags-y += -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO diff --git a/arch/arm64/kernel/vdso/vdso.lds.S b/arch/arm64/kernel/vdso/vdso.lds.S index d808ad31e01f..61dbb4c838ef 100644 --- a/arch/arm64/kernel/vdso/vdso.lds.S +++ b/arch/arm64/kernel/vdso/vdso.lds.S @@ -40,9 +40,6 @@ SECTIONS PROVIDE (_etext = .); PROVIDE (etext = .); - .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr - .eh_frame : { KEEP (*(.eh_frame)) } :text - .dynamic : { *(.dynamic) } :text :dynamic .rodata : { *(.rodata*) } :text @@ -54,6 +51,7 @@ SECTIONS *(.note.GNU-stack) *(.data .data.* .gnu.linkonce.d.* .sdata*) *(.bss .sbss .dynbss .dynsbss) + *(.eh_frame .eh_frame_hdr) } } @@ -66,7 +64,6 @@ PHDRS text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */ dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ note PT_NOTE FLAGS(4); /* PF_R */ - eh_frame_hdr PT_GNU_EH_FRAME; } /* From f34d93f30d6a72f6b15ba24b6994b746df0c30de Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 4 Jan 2021 12:15:41 +0000 Subject: [PATCH 355/547] arm64: kasan: Set TCR_EL1.TBID1 when KASAN_HW_TAGS is enabled Commit 49b3cf035edc ("kasan: arm64: set TCR_EL1.TBID1 when enabled") set the TBID1 bit for the KASAN_SW_TAGS configuration, freeing up 8 bits to be used by PAC. With in-kernel MTE now in mainline, also set this bit for the KASAN_HW_TAGS configuration. Signed-off-by: Catalin Marinas Cc: Peter Collingbourne Acked-by: Vincenzo Frascino Acked-by: Andrey Konovalov --- arch/arm64/mm/proc.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 37a54b57178a..1f7ee8c8b7b8 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -46,7 +46,7 @@ #endif #ifdef CONFIG_KASAN_HW_TAGS -#define TCR_KASAN_HW_FLAGS SYS_TCR_EL1_TCMA1 | TCR_TBI1 +#define TCR_KASAN_HW_FLAGS SYS_TCR_EL1_TCMA1 | TCR_TBI1 | TCR_TBID1 #else #define TCR_KASAN_HW_FLAGS 0 #endif From a8f7e08a81708920a928664a865208fdf451c49f Mon Sep 17 00:00:00 2001 From: Peter Gonda Date: Tue, 5 Jan 2021 08:33:11 -0800 Subject: [PATCH 356/547] x86/sev-es: Fix SEV-ES OUT/IN immediate opcode vc handling The IN and OUT instructions with port address as an immediate operand only use an 8-bit immediate (imm8). The current VC handler uses the entire 32-bit immediate value but these instructions only set the first bytes. Cast the operand to an u8 for that. [ bp: Massage commit message. ] Fixes: 25189d08e5168 ("x86/sev-es: Add support for handling IOIO exceptions") Signed-off-by: Peter Gonda Signed-off-by: Borislav Petkov Acked-by: David Rientjes Link: https://lkml.kernel.org/r/20210105163311.221490-1-pgonda@google.com --- arch/x86/kernel/sev-es-shared.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/sev-es-shared.c b/arch/x86/kernel/sev-es-shared.c index 7d04b356d44d..cdc04d091242 100644 --- a/arch/x86/kernel/sev-es-shared.c +++ b/arch/x86/kernel/sev-es-shared.c @@ -305,14 +305,14 @@ static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) case 0xe4: case 0xe5: *exitinfo |= IOIO_TYPE_IN; - *exitinfo |= (u64)insn->immediate.value << 16; + *exitinfo |= (u8)insn->immediate.value << 16; break; /* OUT immediate opcodes */ case 0xe6: case 0xe7: *exitinfo |= IOIO_TYPE_OUT; - *exitinfo |= (u64)insn->immediate.value << 16; + *exitinfo |= (u8)insn->immediate.value << 16; break; /* IN register opcodes */ From d16baa3f1453c14d680c5fee01cd122a22d0e0ce Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Jan 2021 12:37:23 -0500 Subject: [PATCH 357/547] blk-iocost: fix NULL iocg deref from racing against initialization When initializing iocost for a queue, its rqos should be registered before the blkcg policy is activated to allow policy data initiailization to lookup the associated ioc. This unfortunately means that the rqos methods can be called on bios before iocgs are attached to all existing blkgs. While the race is theoretically possible on ioc_rqos_throttle(), it mostly happened in ioc_rqos_merge() due to the difference in how they lookup ioc. The former determines it from the passed in @rqos and then bails before dereferencing iocg if the looked up ioc is disabled, which most likely is the case if initialization is still in progress. The latter looked up ioc by dereferencing the possibly NULL iocg making it a lot more prone to actually triggering the bug. * Make ioc_rqos_merge() use the same method as ioc_rqos_throttle() to look up ioc for consistency. * Make ioc_rqos_throttle() and ioc_rqos_merge() test for NULL iocg before dereferencing it. * Explain the danger of NULL iocgs in blk_iocost_init(). Signed-off-by: Tejun Heo Reported-by: Jonathan Lemon Cc: stable@vger.kernel.org # v5.4+ Signed-off-by: Jens Axboe --- block/blk-iocost.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index ac6078a34939..98d656bdb42b 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2551,8 +2551,8 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) bool use_debt, ioc_locked; unsigned long flags; - /* bypass IOs if disabled or for root cgroup */ - if (!ioc->enabled || !iocg->level) + /* bypass IOs if disabled, still initializing, or for root cgroup */ + if (!ioc->enabled || !iocg || !iocg->level) return; /* calculate the absolute vtime cost */ @@ -2679,14 +2679,14 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio) { struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg); - struct ioc *ioc = iocg->ioc; + struct ioc *ioc = rqos_to_ioc(rqos); sector_t bio_end = bio_end_sector(bio); struct ioc_now now; u64 vtime, abs_cost, cost; unsigned long flags; - /* bypass if disabled or for root cgroup */ - if (!ioc->enabled || !iocg->level) + /* bypass if disabled, still initializing, or for root cgroup */ + if (!ioc->enabled || !iocg || !iocg->level) return; abs_cost = calc_vtime_cost(bio, iocg, true); @@ -2863,6 +2863,12 @@ static int blk_iocost_init(struct request_queue *q) ioc_refresh_params(ioc, true); spin_unlock_irq(&ioc->lock); + /* + * rqos must be added before activation to allow iocg_pd_init() to + * lookup the ioc from q. This means that the rqos methods may get + * called before policy activation completion, can't assume that the + * target bio has an iocg associated and need to test for NULL iocg. + */ rq_qos_add(q, rqos); ret = blkcg_activate_policy(q, &blkcg_policy_iocost); if (ret) { From 6d4d273588378c65915acaf7b2ee74e9dd9c130a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 10 Dec 2020 10:44:33 +0100 Subject: [PATCH 358/547] bfq: Fix computation of shallow depth BFQ computes number of tags it allows to be allocated for each request type based on tag bitmap. However it uses 1 << bitmap.shift as number of available tags which is wrong. 'shift' is just an internal bitmap value containing logarithm of how many bits bitmap uses in each bitmap word. Thus number of tags allowed for some request types can be far to low. Use proper bitmap.depth which has the number of tags instead. Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 9e81d1052091..9e4eb0fc1c16 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -6332,13 +6332,13 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd, * limit 'something'. */ /* no more than 50% of tags for async I/O */ - bfqd->word_depths[0][0] = max((1U << bt->sb.shift) >> 1, 1U); + bfqd->word_depths[0][0] = max(bt->sb.depth >> 1, 1U); /* * no more than 75% of tags for sync writes (25% extra tags * w.r.t. async I/O, to prevent async I/O from starving sync * writes) */ - bfqd->word_depths[0][1] = max(((1U << bt->sb.shift) * 3) >> 2, 1U); + bfqd->word_depths[0][1] = max((bt->sb.depth * 3) >> 2, 1U); /* * In-word depths in case some bfq_queue is being weight- @@ -6348,9 +6348,9 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd, * shortage. */ /* no more than ~18% of tags for async I/O */ - bfqd->word_depths[1][0] = max(((1U << bt->sb.shift) * 3) >> 4, 1U); + bfqd->word_depths[1][0] = max((bt->sb.depth * 3) >> 4, 1U); /* no more than ~37% of tags for sync writes (~20% extra tags) */ - bfqd->word_depths[1][1] = max(((1U << bt->sb.shift) * 6) >> 4, 1U); + bfqd->word_depths[1][1] = max((bt->sb.depth * 6) >> 4, 1U); for (i = 0; i < 2; i++) for (j = 0; j < 2; j++) From 170b3bbda08852277b97f4f0516df0785c939764 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Tue, 5 Jan 2021 21:53:40 +0800 Subject: [PATCH 359/547] =?UTF-8?q?io=5Furing:=20Delete=20useless=20variab?= =?UTF-8?q?le=20=E2=80=98id=E2=80=99=20in=20io=5Fprep=5Fasync=5Fwork?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix follow warning: fs/io_uring.c:1523:22: warning: variable ‘id’ set but not used [-Wunused-but-set-variable] struct io_identity *id; ^~ Reported-by: Hulk Robot Signed-off-by: Ye Bin Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5bccb235271f..dc92ca5090a3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1523,10 +1523,8 @@ static void io_prep_async_work(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; struct io_ring_ctx *ctx = req->ctx; - struct io_identity *id; io_req_init_async(req); - id = req->work.identity; if (req->flags & REQ_F_FORCE_ASYNC) req->work.flags |= IO_WQ_WORK_CONCURRENT; From aebf5db917055b38f4945ed6d621d9f07a44ff30 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 21 Dec 2020 12:33:35 +0800 Subject: [PATCH 360/547] block: fix use-after-free in disk_part_iter_next Make sure that bdgrab() is done on the 'block_device' instance before referring to it for avoiding use-after-free. Cc: Reported-by: syzbot+825f0f9657d4e528046e@syzkaller.appspotmail.com Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 73faec438e49..419548e92d82 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -246,15 +246,18 @@ struct block_device *disk_part_iter_next(struct disk_part_iter *piter) part = rcu_dereference(ptbl->part[piter->idx]); if (!part) continue; - if (!bdev_nr_sectors(part) && - !(piter->flags & DISK_PITER_INCL_EMPTY) && - !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 && - piter->idx == 0)) - continue; - piter->part = bdgrab(part); if (!piter->part) continue; + if (!bdev_nr_sectors(part) && + !(piter->flags & DISK_PITER_INCL_EMPTY) && + !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 && + piter->idx == 0)) { + bdput(piter->part); + piter->part = NULL; + continue; + } + piter->idx += inc; break; } From 6775ae901ffd130d0be9c32837f88d1f9d560189 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 22 Dec 2020 17:42:32 +0100 Subject: [PATCH 361/547] iommu/iova: fix 'domain' typos Replace misspelled 'doamin' with 'domain' in several comments. Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20201222164232.88795-1-sgarzare@redhat.com Signed-off-by: Will Deacon --- drivers/iommu/iova.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index 4bb3293ae4d7..d20b8b333d30 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -358,7 +358,7 @@ static void private_free_iova(struct iova_domain *iovad, struct iova *iova) * @iovad: - iova domain in question. * @pfn: - page frame number * This function finds and returns an iova belonging to the - * given doamin which matches the given pfn. + * given domain which matches the given pfn. */ struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn) { @@ -601,7 +601,7 @@ void queue_iova(struct iova_domain *iovad, EXPORT_SYMBOL_GPL(queue_iova); /** - * put_iova_domain - destroys the iova doamin + * put_iova_domain - destroys the iova domain * @iovad: - iova domain in question. * All the iova's in that domain are destroyed. */ @@ -712,9 +712,9 @@ EXPORT_SYMBOL_GPL(reserve_iova); /** * copy_reserved_iova - copies the reserved between domains - * @from: - source doamin from where to copy + * @from: - source domain from where to copy * @to: - destination domin where to copy - * This function copies reserved iova's from one doamin to + * This function copies reserved iova's from one domain to * other. */ void From ff2b46d7cff80d27d82f7f3252711f4ca1666129 Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Tue, 5 Jan 2021 13:18:37 +0800 Subject: [PATCH 362/547] iommu/intel: Fix memleak in intel_irq_remapping_alloc When irq_domain_get_irq_data() or irqd_cfg() fails at i == 0, data allocated by kzalloc() has not been freed before returning, which leads to memleak. Fixes: b106ee63abcc ("irq_remapping/vt-d: Enhance Intel IR driver to support hierarchical irqdomains") Signed-off-by: Dinghao Liu Acked-by: Lu Baolu Link: https://lore.kernel.org/r/20210105051837.32118-1-dinghao.liu@zju.edu.cn Signed-off-by: Will Deacon --- drivers/iommu/intel/irq_remapping.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index aeffda92b10b..685200a5cff0 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -1353,6 +1353,8 @@ static int intel_irq_remapping_alloc(struct irq_domain *domain, irq_data = irq_domain_get_irq_data(domain, virq + i); irq_cfg = irqd_cfg(irq_data); if (!irq_data || !irq_cfg) { + if (!i) + kfree(data); ret = -EINVAL; goto out_free_data; } From 12bc4570c14e24e6244d66466aeda994f805634b Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 5 Jan 2021 01:32:51 +0000 Subject: [PATCH 363/547] iommu/amd: Set iommu->int_enabled consistently when interrupts are set up When I made the INTCAPXT support stop gratuitously pretending to be MSI, I missed the fact that iommu_setup_msi() also sets the ->int_enabled flag. I missed this in the iommu_setup_intcapxt() code path, which means that a resume from suspend will try to allocate the IRQ domains again, accidentally re-enabling interrupts as it does, resulting in much sadness. Lift out the bit which sets iommu->int_enabled into the iommu_init_irq() function which is also where it gets checked. Link: https://lore.kernel.org/r/20210104132250.GE32151@zn.tnic/ Fixes: d1adcfbb520c ("iommu/amd: Fix IOMMU interrupt generation in X2APIC mode") Reported-by: Borislav Petkov Signed-off-by: David Woodhouse Tested-by: Borislav Petkov Link: https://lore.kernel.org/r/50cd5f55be8ead0937ac315cd2f5b89364f6a9a5.camel@infradead.org Signed-off-by: Will Deacon --- drivers/iommu/amd/init.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index f54cd79b43e4..6a1f7048dacc 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -1973,8 +1973,6 @@ static int iommu_setup_msi(struct amd_iommu *iommu) return r; } - iommu->int_enabled = true; - return 0; } @@ -2169,6 +2167,7 @@ static int iommu_init_irq(struct amd_iommu *iommu) if (ret) return ret; + iommu->int_enabled = true; enable_faults: iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); From b34f10c2dc5961021850c3c15f46a84b56a0c0e8 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 5 Jan 2021 01:36:13 +0000 Subject: [PATCH 364/547] iommu/amd: Stop irq_remapping_select() matching when remapping is disabled The AMD IOMMU initialisation registers the IRQ remapping domain for each IOMMU before doing the final sanity check that every I/OAPIC is covered. This means that the AMD irq_remapping_select() function gets invoked even when IRQ remapping has been disabled, eventually leading to a NULL pointer dereference in alloc_irq_table(). Unfortunately, the IVRS isn't fully parsed early enough that the sanity check can be done in time to registering the IRQ domain altogether. Doing that would be nice, but is a larger and more error-prone task. The simple fix is just for irq_remapping_select() to refuse to report a match when IRQ remapping has disabled. Link: https://lore.kernel.org/lkml/ed4be9b4-24ac-7128-c522-7ef359e8185d@gmx.at Fixes: a1a785b57242 ("iommu/amd: Implement select() method on remapping irqdomain") Reported-by: Johnathan Smithinovic Signed-off-by: David Woodhouse Link: https://lore.kernel.org/r/04bbe8bca87f81a3cfa93ec4299e53f47e00e5b3.camel@infradead.org Signed-off-by: Will Deacon --- drivers/iommu/amd/iommu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 7e2c445a1fae..f0adbc48fd17 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -3854,6 +3854,9 @@ static int irq_remapping_select(struct irq_domain *d, struct irq_fwspec *fwspec, struct amd_iommu *iommu; int devid = -1; + if (!amd_iommu_irq_remap) + return 0; + if (x86_fwspec_is_ioapic(fwspec)) devid = get_ioapic_devid(fwspec->param[0]); else if (x86_fwspec_is_hpet(fwspec)) From c2407cf7d22d0c0d94cf20342b3b8f06f1d904e7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 5 Jan 2021 11:33:00 -0800 Subject: [PATCH 365/547] mm: make wait_on_page_writeback() wait for multiple pending writebacks Ever since commit 2a9127fcf229 ("mm: rewrite wait_on_page_bit_common() logic") we've had some very occasional reports of BUG_ON(PageWriteback) in write_cache_pages(), which we thought we already fixed in commit 073861ed77b6 ("mm: fix VM_BUG_ON(PageTail) and BUG_ON(PageWriteback)"). But syzbot just reported another one, even with that commit in place. And it turns out that there's a simpler way to trigger the BUG_ON() than the one Hugh found with page re-use. It all boils down to the fact that the page writeback is ostensibly serialized by the page lock, but that isn't actually really true. Yes, the people _setting_ writeback all do so under the page lock, but the actual clearing of the bit - and waking up any waiters - happens without any page lock. This gives us this fairly simple race condition: CPU1 = end previous writeback CPU2 = start new writeback under page lock CPU3 = write_cache_pages() CPU1 CPU2 CPU3 ---- ---- ---- end_page_writeback() test_clear_page_writeback(page) ... delayed... lock_page(); set_page_writeback() unlock_page() lock_page() wait_on_page_writeback(); wake_up_page(page, PG_writeback); .. wakes up CPU3 .. BUG_ON(PageWriteback(page)); where the BUG_ON() happens because we woke up the PG_writeback bit becasue of the _previous_ writeback, but a new one had already been started because the clearing of the bit wasn't actually atomic wrt the actual wakeup or serialized by the page lock. The reason this didn't use to happen was that the old logic in waiting on a page bit would just loop if it ever saw the bit set again. The nice proper fix would probably be to get rid of the whole "wait for writeback to clear, and then set it" logic in the writeback path, and replace it with an atomic "wait-to-set" (ie the same as we have for page locking: we set the page lock bit with a single "lock_page()", not with "wait for lock bit to clear and then set it"). However, out current model for writeback is that the waiting for the writeback bit is done by the generic VFS code (ie write_cache_pages()), but the actual setting of the writeback bit is done much later by the filesystem ".writepages()" function. IOW, to make the writeback bit have that same kind of "wait-to-set" behavior as we have for page locking, we'd have to change our roughly ~50 different writeback functions. Painful. Instead, just make "wait_on_page_writeback()" loop on the very unlikely situation that the PG_writeback bit is still set, basically re-instating the old behavior. This is very non-optimal in case of contention, but since we only ever set the bit under the page lock, that situation is controlled. Reported-by: syzbot+2fc0712f8f8b8b8fa0ef@syzkaller.appspotmail.com Fixes: 2a9127fcf229 ("mm: rewrite wait_on_page_bit_common() logic") Acked-by: Hugh Dickins Cc: Andrew Morton Cc: Matthew Wilcox Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 586042472ac9..eb34d204d4ee 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2826,7 +2826,7 @@ EXPORT_SYMBOL(__test_set_page_writeback); */ void wait_on_page_writeback(struct page *page) { - if (PageWriteback(page)) { + while (PageWriteback(page)) { trace_wait_on_page_writeback(page, page_mapping(page)); wait_on_page_bit(page, PG_writeback); } From 8a48c0a3360bf2bf4f40c980d0ec216e770e58ee Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 4 Jan 2021 19:44:53 -0800 Subject: [PATCH 366/547] arch/arc: add copy_user_page() to to fix build error on ARC fs/dax.c uses copy_user_page() but ARC does not provide that interface, resulting in a build error. Provide copy_user_page() in . ../fs/dax.c: In function 'copy_cow_page_dax': ../fs/dax.c:702:2: error: implicit declaration of function 'copy_user_page'; did you mean 'copy_to_user_page'? [-Werror=implicit-function-declaration] Reported-by: kernel test robot Signed-off-by: Randy Dunlap Cc: Vineet Gupta Cc: linux-snps-arc@lists.infradead.org Cc: Dan Williams #Acked-by: Vineet Gupta # v1 Cc: Andrew Morton Cc: Matthew Wilcox Cc: Jan Kara Cc: linux-fsdevel@vger.kernel.org Cc: linux-nvdimm@lists.01.org #Reviewed-by: Ira Weiny # v2 Signed-off-by: Vineet Gupta --- arch/arc/include/asm/page.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arc/include/asm/page.h b/arch/arc/include/asm/page.h index 23e41e890eda..ad9b7fe4dba3 100644 --- a/arch/arc/include/asm/page.h +++ b/arch/arc/include/asm/page.h @@ -10,6 +10,7 @@ #ifndef __ASSEMBLY__ #define clear_page(paddr) memset((paddr), 0, PAGE_SIZE) +#define copy_user_page(to, from, vaddr, pg) copy_page(to, from) #define copy_page(to, from) memcpy((to), (from), PAGE_SIZE) struct vm_area_struct; From f4d9359de8ac0fb64a5ecc9c34833705eb53327b Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Thu, 5 Nov 2020 13:22:10 -0800 Subject: [PATCH 367/547] include/soc: remove headers for EZChip NPS NPS platform has been removed from ARC port and there are no in-tree user of it now . So RIP ! Signed-off-by: Vineet Gupta --- include/soc/nps/common.h | 172 --------------------------------------- include/soc/nps/mtm.h | 59 -------------- 2 files changed, 231 deletions(-) delete mode 100644 include/soc/nps/common.h delete mode 100644 include/soc/nps/mtm.h diff --git a/include/soc/nps/common.h b/include/soc/nps/common.h deleted file mode 100644 index 8c18dc6d3fde..000000000000 --- a/include/soc/nps/common.h +++ /dev/null @@ -1,172 +0,0 @@ -/* - * Copyright (c) 2016, Mellanox Technologies. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef SOC_NPS_COMMON_H -#define SOC_NPS_COMMON_H - -#ifdef CONFIG_SMP -#define NPS_IPI_IRQ 5 -#endif - -#define NPS_HOST_REG_BASE 0xF6000000 - -#define NPS_MSU_BLKID 0x018 - -#define CTOP_INST_RSPI_GIC_0_R12 0x3C56117E -#define CTOP_INST_MOV2B_FLIP_R3_B1_B2_INST 0x5B60 -#define CTOP_INST_MOV2B_FLIP_R3_B1_B2_LIMM 0x00010422 - -#ifndef AUX_IENABLE -#define AUX_IENABLE 0x40c -#endif - -#define CTOP_AUX_IACK (0xFFFFF800 + 0x088) - -#ifndef __ASSEMBLY__ - -/* In order to increase compilation test coverage */ -#ifdef CONFIG_ARC -static inline void nps_ack_gic(void) -{ - __asm__ __volatile__ ( - " .word %0\n" - : - : "i"(CTOP_INST_RSPI_GIC_0_R12) - : "memory"); -} -#else -static inline void nps_ack_gic(void) { } -#define write_aux_reg(r, v) -#define read_aux_reg(r) 0 -#endif - -/* CPU global ID */ -struct global_id { - union { - struct { -#ifdef CONFIG_EZNPS_MTM_EXT - u32 __reserved:20, cluster:4, core:4, thread:4; -#else - u32 __reserved:24, cluster:4, core:4; -#endif - }; - u32 value; - }; -}; - -/* - * Convert logical to physical CPU IDs - * - * The conversion swap bits 1 and 2 of cluster id (out of 4 bits) - * Now quad of logical clusters id's are adjacent physically, - * and not like the id's physically came with each cluster. - * Below table is 4x4 mesh of core clusters as it layout on chip. - * Cluster ids are in format: logical (physical) - * - * ----------------- ------------------ - * 3 | 5 (3) 7 (7) | | 13 (11) 15 (15)| - * - * 2 | 4 (2) 6 (6) | | 12 (10) 14 (14)| - * ----------------- ------------------ - * 1 | 1 (1) 3 (5) | | 9 (9) 11 (13)| - * - * 0 | 0 (0) 2 (4) | | 8 (8) 10 (12)| - * ----------------- ------------------ - * 0 1 2 3 - */ -static inline int nps_cluster_logic_to_phys(int cluster) -{ -#ifdef __arc__ - __asm__ __volatile__( - " mov r3,%0\n" - " .short %1\n" - " .word %2\n" - " mov %0,r3\n" - : "+r"(cluster) - : "i"(CTOP_INST_MOV2B_FLIP_R3_B1_B2_INST), - "i"(CTOP_INST_MOV2B_FLIP_R3_B1_B2_LIMM) - : "r3"); -#endif - - return cluster; -} - -#define NPS_CPU_TO_CLUSTER_NUM(cpu) \ - ({ struct global_id gid; gid.value = cpu; \ - nps_cluster_logic_to_phys(gid.cluster); }) - -struct nps_host_reg_address { - union { - struct { - u32 base:8, cl_x:4, cl_y:4, - blkid:6, reg:8, __reserved:2; - }; - u32 value; - }; -}; - -struct nps_host_reg_address_non_cl { - union { - struct { - u32 base:7, blkid:11, reg:12, __reserved:2; - }; - u32 value; - }; -}; - -static inline void *nps_host_reg_non_cl(u32 blkid, u32 reg) -{ - struct nps_host_reg_address_non_cl reg_address; - - reg_address.value = NPS_HOST_REG_BASE; - reg_address.blkid = blkid; - reg_address.reg = reg; - - return (void *)reg_address.value; -} - -static inline void *nps_host_reg(u32 cpu, u32 blkid, u32 reg) -{ - struct nps_host_reg_address reg_address; - u32 cl = NPS_CPU_TO_CLUSTER_NUM(cpu); - - reg_address.value = NPS_HOST_REG_BASE; - reg_address.cl_x = (cl >> 2) & 0x3; - reg_address.cl_y = cl & 0x3; - reg_address.blkid = blkid; - reg_address.reg = reg; - - return (void *)reg_address.value; -} -#endif /* __ASSEMBLY__ */ - -#endif /* SOC_NPS_COMMON_H */ diff --git a/include/soc/nps/mtm.h b/include/soc/nps/mtm.h deleted file mode 100644 index d2f5e7e3703e..000000000000 --- a/include/soc/nps/mtm.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2016, Mellanox Technologies. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef SOC_NPS_MTM_H -#define SOC_NPS_MTM_H - -#define CTOP_INST_HWSCHD_OFF_R3 0x3B6F00BF -#define CTOP_INST_HWSCHD_RESTORE_R3 0x3E6F70C3 - -static inline void hw_schd_save(unsigned int *flags) -{ - __asm__ __volatile__( - " .word %1\n" - " st r3,[%0]\n" - : - : "r"(flags), "i"(CTOP_INST_HWSCHD_OFF_R3) - : "r3", "memory"); -} - -static inline void hw_schd_restore(unsigned int flags) -{ - __asm__ __volatile__( - " mov r3, %0\n" - " .word %1\n" - : - : "r"(flags), "i"(CTOP_INST_HWSCHD_RESTORE_R3) - : "r3"); -} - -#endif /* SOC_NPS_MTM_H */ From 2860d45a589818dd8ffd90cdc4bcf77f36a5a6be Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sun, 3 Jan 2021 22:36:17 +0100 Subject: [PATCH 368/547] qed: select CONFIG_CRC32 Without this, the driver fails to link: lpc_eth.c:(.text+0x1934): undefined reference to `crc32_le' arm-linux-gnueabi-ld: drivers/net/ethernet/qlogic/qed/qed_debug.o: in function `qed_grc_dump': qed_debug.c:(.text+0x4068): undefined reference to `crc32_le' arm-linux-gnueabi-ld: drivers/net/ethernet/qlogic/qed/qed_debug.o: in function `qed_idle_chk_dump': qed_debug.c:(.text+0x51fc): undefined reference to `crc32_le' arm-linux-gnueabi-ld: drivers/net/ethernet/qlogic/qed/qed_debug.o: in function `qed_mcp_trace_dump': qed_debug.c:(.text+0x6000): undefined reference to `crc32_le' arm-linux-gnueabi-ld: drivers/net/ethernet/qlogic/qed/qed_debug.o: in function `qed_dbg_reg_fifo_dump': qed_debug.c:(.text+0x66cc): undefined reference to `crc32_le' arm-linux-gnueabi-ld: drivers/net/ethernet/qlogic/qed/qed_debug.o:qed_debug.c:(.text+0x6aa4): more undefined references to `crc32_le' follow Fixes: 7a4b21b7d1f0 ("qed: Add nvram selftest") Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/qlogic/Kconfig b/drivers/net/ethernet/qlogic/Kconfig index 4366c7a8de95..6b5ddb07ee83 100644 --- a/drivers/net/ethernet/qlogic/Kconfig +++ b/drivers/net/ethernet/qlogic/Kconfig @@ -78,6 +78,7 @@ config QED depends on PCI select ZLIB_INFLATE select CRC8 + select CRC32 select NET_DEVLINK help This enables the support for Marvell FastLinQ adapters family. From f9d6f94132f01d2a552dcbab54fa56496638186d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sun, 3 Jan 2021 22:36:18 +0100 Subject: [PATCH 369/547] phy: dp83640: select CONFIG_CRC32 Without crc32, this driver fails to link: arm-linux-gnueabi-ld: drivers/net/phy/dp83640.o: in function `match': dp83640.c:(.text+0x476c): undefined reference to `crc32_le' Fixes: 539e44d26855 ("dp83640: Include hash in timestamp/packet matching") Signed-off-by: Arnd Bergmann Reviewed-by: Andrew Lunn Acked-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/ptp/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig index 476d7c7fe70a..d2bf05ccbbe2 100644 --- a/drivers/ptp/Kconfig +++ b/drivers/ptp/Kconfig @@ -64,6 +64,7 @@ config DP83640_PHY depends on NETWORK_PHY_TIMESTAMPING depends on PHYLIB depends on PTP_1588_CLOCK + select CRC32 help Supports the DP83640 PHYTER with IEEE 1588 features. From 1d48595c786b1b9dc6be301e8d7f6fc74e9882aa Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sun, 3 Jan 2021 22:36:19 +0100 Subject: [PATCH 370/547] can: kvaser_pciefd: select CONFIG_CRC32 Without crc32, this driver fails to link: arm-linux-gnueabi-ld: drivers/net/can/kvaser_pciefd.o: in function `kvaser_pciefd_probe': kvaser_pciefd.c:(.text+0x2b0): undefined reference to `crc32_be' Fixes: 26ad340e582d ("can: kvaser_pciefd: Add driver for Kvaser PCIEcan devices") Signed-off-by: Arnd Bergmann Acked-by: Marc Kleine-Budde Signed-off-by: David S. Miller --- drivers/net/can/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/can/Kconfig b/drivers/net/can/Kconfig index 424970939fd4..1c28eade6bec 100644 --- a/drivers/net/can/Kconfig +++ b/drivers/net/can/Kconfig @@ -123,6 +123,7 @@ config CAN_JANZ_ICAN3 config CAN_KVASER_PCIEFD depends on PCI tristate "Kvaser PCIe FD cards" + select CRC32 help This is a driver for the Kvaser PCI Express CAN FD family. From e186620d7bf11b274b985b839c38266d7918cc05 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sun, 3 Jan 2021 22:36:20 +0100 Subject: [PATCH 371/547] wil6210: select CONFIG_CRC32 Without crc32, the driver fails to link: arm-linux-gnueabi-ld: drivers/net/wireless/ath/wil6210/fw.o: in function `wil_fw_verify': fw.c:(.text+0x74c): undefined reference to `crc32_le' arm-linux-gnueabi-ld: drivers/net/wireless/ath/wil6210/fw.o:fw.c:(.text+0x758): more undefined references to `crc32_le' follow Fixes: 151a9706503f ("wil6210: firmware download") Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- drivers/net/wireless/ath/wil6210/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/ath/wil6210/Kconfig b/drivers/net/wireless/ath/wil6210/Kconfig index 6a95b199bf62..f074e9c31aa2 100644 --- a/drivers/net/wireless/ath/wil6210/Kconfig +++ b/drivers/net/wireless/ath/wil6210/Kconfig @@ -2,6 +2,7 @@ config WIL6210 tristate "Wilocity 60g WiFi card wil6210 support" select WANT_DEV_COREDUMP + select CRC32 depends on CFG80211 depends on PCI default n From 152a8a6c017bfdeda7f6d052fbc6e151891bd9b6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sun, 3 Jan 2021 22:36:21 +0100 Subject: [PATCH 372/547] cfg80211: select CONFIG_CRC32 Without crc32 support, this fails to link: arm-linux-gnueabi-ld: net/wireless/scan.o: in function `cfg80211_scan_6ghz': scan.c:(.text+0x928): undefined reference to `crc32_le' Fixes: c8cb5b854b40 ("nl80211/cfg80211: support 6 GHz scanning") Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- net/wireless/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig index 27026f587fa6..f620acd2a0f5 100644 --- a/net/wireless/Kconfig +++ b/net/wireless/Kconfig @@ -21,6 +21,7 @@ config CFG80211 tristate "cfg80211 - wireless configuration API" depends on RFKILL || !RFKILL select FW_LOADER + select CRC32 # may need to update this when certificates are changed and are # using a different algorithm, though right now they shouldn't # (this is here rather than below to allow it to be a module) From 51049bd903a81307f751babe15a1df8d197884e8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sun, 3 Jan 2021 22:36:22 +0100 Subject: [PATCH 373/547] misdn: dsp: select CONFIG_BITREVERSE Without this, we run into a link error arm-linux-gnueabi-ld: drivers/isdn/mISDN/dsp_audio.o: in function `dsp_audio_generate_law_tables': (.text+0x30c): undefined reference to `byte_rev_table' arm-linux-gnueabi-ld: drivers/isdn/mISDN/dsp_audio.o:(.text+0x5e4): more undefined references to `byte_rev_table' follow Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- drivers/isdn/mISDN/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/isdn/mISDN/Kconfig b/drivers/isdn/mISDN/Kconfig index 26cf0ac9c4ad..c9a53c222472 100644 --- a/drivers/isdn/mISDN/Kconfig +++ b/drivers/isdn/mISDN/Kconfig @@ -13,6 +13,7 @@ if MISDN != n config MISDN_DSP tristate "Digital Audio Processing of transparent data" depends on MISDN + select BITREVERSE help Enable support for digital audio processing capability. From 69931e11288520c250152180ecf9b6ac5e6e40ed Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sun, 3 Jan 2021 22:36:23 +0100 Subject: [PATCH 374/547] wan: ds26522: select CONFIG_BITREVERSE Without this, the driver runs into a link failure arm-linux-gnueabi-ld: drivers/net/wan/slic_ds26522.o: in function `slic_ds26522_probe': slic_ds26522.c:(.text+0x100c): undefined reference to `byte_rev_table' arm-linux-gnueabi-ld: slic_ds26522.c:(.text+0x1cdc): undefined reference to `byte_rev_table' arm-linux-gnueabi-ld: drivers/net/wan/slic_ds26522.o: in function `slic_write': slic_ds26522.c:(.text+0x1e4c): undefined reference to `byte_rev_table' Fixes: c37d4a0085c5 ("Maxim/driver: Add driver for maxim ds26522") Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- drivers/net/wan/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wan/Kconfig b/drivers/net/wan/Kconfig index 4029fde71a9e..83c9481995dd 100644 --- a/drivers/net/wan/Kconfig +++ b/drivers/net/wan/Kconfig @@ -282,6 +282,7 @@ config SLIC_DS26522 tristate "Slic Maxim ds26522 card support" depends on SPI depends on FSL_SOC || ARCH_MXC || ARCH_LAYERSCAPE || COMPILE_TEST + select BITREVERSE help This module initializes and configures the slic maxim card in T1 or E1 mode. From 0f7ba7bc46fa0b574ccacf5672991b321e028492 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 3 Jan 2021 11:26:26 +1100 Subject: [PATCH 375/547] net/sonic: Fix some resource leaks in error handling paths A call to dma_alloc_coherent() is wrapped by sonic_alloc_descriptors(). This is correctly freed in the remove function, but not in the error handling path of the probe function. Fix this by adding the missing dma_free_coherent() call. While at it, rename a label in order to be slightly more informative. Cc: Christophe JAILLET Cc: Thomas Bogendoerfer Cc: Chris Zankel References: commit 10e3cc180e64 ("net/sonic: Fix a resource leak in an error handling path in 'jazz_sonic_probe()'") Fixes: 74f2a5f0ef64 ("xtensa: Add support for the Sonic Ethernet device for the XT2000 board.") Fixes: efcce839360f ("[PATCH] macsonic/jazzsonic network drivers update") Signed-off-by: Christophe JAILLET Signed-off-by: Finn Thain Signed-off-by: David S. Miller --- drivers/net/ethernet/natsemi/macsonic.c | 12 ++++++++++-- drivers/net/ethernet/natsemi/xtsonic.c | 7 +++++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/natsemi/macsonic.c b/drivers/net/ethernet/natsemi/macsonic.c index 776b7d264dc3..2289e1fe3741 100644 --- a/drivers/net/ethernet/natsemi/macsonic.c +++ b/drivers/net/ethernet/natsemi/macsonic.c @@ -506,10 +506,14 @@ static int mac_sonic_platform_probe(struct platform_device *pdev) err = register_netdev(dev); if (err) - goto out; + goto undo_probe; return 0; +undo_probe: + dma_free_coherent(lp->device, + SIZEOF_SONIC_DESC * SONIC_BUS_SCALE(lp->dma_bitmode), + lp->descriptors, lp->descriptors_laddr); out: free_netdev(dev); @@ -584,12 +588,16 @@ static int mac_sonic_nubus_probe(struct nubus_board *board) err = register_netdev(ndev); if (err) - goto out; + goto undo_probe; nubus_set_drvdata(board, ndev); return 0; +undo_probe: + dma_free_coherent(lp->device, + SIZEOF_SONIC_DESC * SONIC_BUS_SCALE(lp->dma_bitmode), + lp->descriptors, lp->descriptors_laddr); out: free_netdev(ndev); return err; diff --git a/drivers/net/ethernet/natsemi/xtsonic.c b/drivers/net/ethernet/natsemi/xtsonic.c index afa166ff7aef..28d9e98db81a 100644 --- a/drivers/net/ethernet/natsemi/xtsonic.c +++ b/drivers/net/ethernet/natsemi/xtsonic.c @@ -229,11 +229,14 @@ int xtsonic_probe(struct platform_device *pdev) sonic_msg_init(dev); if ((err = register_netdev(dev))) - goto out1; + goto undo_probe1; return 0; -out1: +undo_probe1: + dma_free_coherent(lp->device, + SIZEOF_SONIC_DESC * SONIC_BUS_SCALE(lp->dma_bitmode), + lp->descriptors, lp->descriptors_laddr); release_region(dev->base_addr, SONIC_MEM_SIZE); out: free_netdev(dev); From cf0720697143f3eaa0779cca5a6602d8557d1c6f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 30 Dec 2020 19:37:53 -0800 Subject: [PATCH 376/547] net: suggest L2 discards be counted towards rx_dropped From the existing definitions it's unclear which stat to use to report filtering based on L2 dst addr in old broadcast-medium Ethernet. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 874cc12a34d9..82708c6db432 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -75,8 +75,9 @@ struct rtnl_link_stats { * * @rx_dropped: Number of packets received but not processed, * e.g. due to lack of resources or unsupported protocol. - * For hardware interfaces this counter should not include packets - * dropped by the device which are counted separately in + * For hardware interfaces this counter may include packets discarded + * due to L2 address filtering but should not include packets dropped + * by the device due to buffer exhaustion which are counted separately in * @rx_missed_errors (since procfs folds those two counters together). * * @tx_dropped: Number of packets dropped on their way to transmission, From 55b7ab1178cbf41f979ff83236d3321ad35ed2ad Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 30 Dec 2020 19:40:27 -0800 Subject: [PATCH 377/547] net: vlan: avoid leaks on register_vlan_dev() failures VLAN checks for NETREG_UNINITIALIZED to distinguish between registration failure and unregistration in progress. Since commit cb626bf566eb ("net-sysfs: Fix reference count leak") registration failure may, however, result in NETREG_UNREGISTERED as well as NETREG_UNINITIALIZED. This fix is similer to cebb69754f37 ("rtnetlink: Fix memory(net_device) leak when ->newlink fails") Fixes: cb626bf566eb ("net-sysfs: Fix reference count leak") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/8021q/vlan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index f292e0267bb9..15bbfaf943fd 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -284,7 +284,8 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id) return 0; out_free_newdev: - if (new_dev->reg_state == NETREG_UNINITIALIZED) + if (new_dev->reg_state == NETREG_UNINITIALIZED || + new_dev->reg_state == NETREG_UNREGISTERED) free_netdev(new_dev); return err; } From 7eeecc4b1f480c7ba1932cb9a7693f8c452640f2 Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Sun, 3 Jan 2021 05:17:41 -0600 Subject: [PATCH 378/547] net: stmmac: dwmac-sun8i: Fix probe error handling stmmac_pltfr_remove does three things in one function, making it inapproprate for unwinding the steps in the probe function. Currently, a failure before the call to stmmac_dvr_probe would leak OF node references due to missing a call to stmmac_remove_config_dt. And an error in stmmac_dvr_probe would cause the driver to attempt to remove a netdevice that was never added. Fix these by reordering the init and splitting out the error handling steps. Fixes: 9f93ac8d4085 ("net-next: stmmac: Add dwmac-sun8i") Fixes: 40a1dcee2d18 ("net: ethernet: dwmac-sun8i: Use the correct function in exit path") Signed-off-by: Samuel Holland Reviewed-by: Chen-Yu Tsai Signed-off-by: David S. Miller --- .../net/ethernet/stmicro/stmmac/dwmac-sun8i.c | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c index 58e0511badba..b20f261fce5b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c @@ -1134,10 +1134,6 @@ static int sun8i_dwmac_probe(struct platform_device *pdev) if (ret) return ret; - plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac); - if (IS_ERR(plat_dat)) - return PTR_ERR(plat_dat); - gmac = devm_kzalloc(dev, sizeof(*gmac), GFP_KERNEL); if (!gmac) return -ENOMEM; @@ -1201,11 +1197,15 @@ static int sun8i_dwmac_probe(struct platform_device *pdev) ret = of_get_phy_mode(dev->of_node, &interface); if (ret) return -EINVAL; - plat_dat->interface = interface; + + plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac); + if (IS_ERR(plat_dat)) + return PTR_ERR(plat_dat); /* platform data specifying hardware features and callbacks. * hardware features were copied from Allwinner drivers. */ + plat_dat->interface = interface; plat_dat->rx_coe = STMMAC_RX_COE_TYPE2; plat_dat->tx_coe = 1; plat_dat->has_sun8i = true; @@ -1216,7 +1216,7 @@ static int sun8i_dwmac_probe(struct platform_device *pdev) ret = sun8i_dwmac_init(pdev, plat_dat->bsp_priv); if (ret) - return ret; + goto dwmac_deconfig; ret = stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res); if (ret) @@ -1230,7 +1230,7 @@ static int sun8i_dwmac_probe(struct platform_device *pdev) if (gmac->variant->soc_has_internal_phy) { ret = get_ephy_nodes(priv); if (ret) - goto dwmac_exit; + goto dwmac_remove; ret = sun8i_dwmac_register_mdio_mux(priv); if (ret) { dev_err(&pdev->dev, "Failed to register mux\n"); @@ -1239,15 +1239,20 @@ static int sun8i_dwmac_probe(struct platform_device *pdev) } else { ret = sun8i_dwmac_reset(priv); if (ret) - goto dwmac_exit; + goto dwmac_remove; } return ret; dwmac_mux: sun8i_dwmac_unset_syscon(gmac); +dwmac_remove: + stmmac_dvr_remove(&pdev->dev); dwmac_exit: - stmmac_pltfr_remove(pdev); -return ret; + sun8i_dwmac_exit(pdev, gmac); +dwmac_deconfig: + stmmac_remove_config_dt(pdev, plat_dat); + + return ret; } static const struct of_device_id sun8i_dwmac_match[] = { From 529254216773acd5039c07aa18cf06fd1f9fccdd Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Sun, 3 Jan 2021 05:17:42 -0600 Subject: [PATCH 379/547] net: stmmac: dwmac-sun8i: Balance internal PHY resource references While stmmac_pltfr_remove calls sun8i_dwmac_exit, the sun8i_dwmac_init and sun8i_dwmac_exit functions are also called by the stmmac_platform suspend/resume callbacks. They may be called many times during the device's lifetime and should not release resources used by the driver. Furthermore, there was no error handling in case registering the MDIO mux failed during probe, and the EPHY clock was never released at all. Fix all of these issues by moving the deinitialization code to a driver removal callback. Also ensure the EPHY is powered down before removal. Fixes: 634db83b8265 ("net: stmmac: dwmac-sun8i: Handle integrated/external MDIOs") Signed-off-by: Samuel Holland Reviewed-by: Chen-Yu Tsai Signed-off-by: David S. Miller --- .../net/ethernet/stmicro/stmmac/dwmac-sun8i.c | 27 ++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c index b20f261fce5b..a05dee5d4584 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c @@ -1004,17 +1004,12 @@ static void sun8i_dwmac_exit(struct platform_device *pdev, void *priv) struct sunxi_priv_data *gmac = priv; if (gmac->variant->soc_has_internal_phy) { - /* sun8i_dwmac_exit could be called with mdiomux uninit */ - if (gmac->mux_handle) - mdio_mux_uninit(gmac->mux_handle); if (gmac->internal_phy_powered) sun8i_dwmac_unpower_internal_phy(gmac); } sun8i_dwmac_unset_syscon(gmac); - reset_control_put(gmac->rst_ephy); - clk_disable_unprepare(gmac->tx_clk); if (gmac->regulator) @@ -1244,6 +1239,8 @@ static int sun8i_dwmac_probe(struct platform_device *pdev) return ret; dwmac_mux: + reset_control_put(gmac->rst_ephy); + clk_put(gmac->ephy_clk); sun8i_dwmac_unset_syscon(gmac); dwmac_remove: stmmac_dvr_remove(&pdev->dev); @@ -1255,6 +1252,24 @@ dwmac_deconfig: return ret; } +static int sun8i_dwmac_remove(struct platform_device *pdev) +{ + struct net_device *ndev = platform_get_drvdata(pdev); + struct stmmac_priv *priv = netdev_priv(ndev); + struct sunxi_priv_data *gmac = priv->plat->bsp_priv; + + if (gmac->variant->soc_has_internal_phy) { + mdio_mux_uninit(gmac->mux_handle); + sun8i_dwmac_unpower_internal_phy(gmac); + reset_control_put(gmac->rst_ephy); + clk_put(gmac->ephy_clk); + } + + stmmac_pltfr_remove(pdev); + + return 0; +} + static const struct of_device_id sun8i_dwmac_match[] = { { .compatible = "allwinner,sun8i-h3-emac", .data = &emac_variant_h3 }, @@ -1274,7 +1289,7 @@ MODULE_DEVICE_TABLE(of, sun8i_dwmac_match); static struct platform_driver sun8i_dwmac_driver = { .probe = sun8i_dwmac_probe, - .remove = stmmac_pltfr_remove, + .remove = sun8i_dwmac_remove, .driver = { .name = "dwmac-sun8i", .pm = &stmmac_pltfr_pm_ops, From b8239638853e3e37b287e4bd4d57b41f14c78550 Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Sun, 3 Jan 2021 05:17:43 -0600 Subject: [PATCH 380/547] net: stmmac: dwmac-sun8i: Balance internal PHY power sun8i_dwmac_exit calls sun8i_dwmac_unpower_internal_phy, but sun8i_dwmac_init did not call sun8i_dwmac_power_internal_phy. This caused PHY power to remain off after a suspend/resume cycle. Fix this by recording if PHY power should be restored, and if so, restoring it. Fixes: 634db83b8265 ("net: stmmac: dwmac-sun8i: Handle integrated/external MDIOs") Signed-off-by: Samuel Holland Signed-off-by: David S. Miller --- .../net/ethernet/stmicro/stmmac/dwmac-sun8i.c | 31 ++++++++++++++----- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c index a05dee5d4584..e2c25c1c702a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c @@ -64,6 +64,7 @@ struct emac_variant { * @variant: reference to the current board variant * @regmap: regmap for using the syscon * @internal_phy_powered: Does the internal PHY is enabled + * @use_internal_phy: Is the internal PHY selected for use * @mux_handle: Internal pointer used by mdio-mux lib */ struct sunxi_priv_data { @@ -74,6 +75,7 @@ struct sunxi_priv_data { const struct emac_variant *variant; struct regmap_field *regmap_field; bool internal_phy_powered; + bool use_internal_phy; void *mux_handle; }; @@ -539,8 +541,11 @@ static const struct stmmac_dma_ops sun8i_dwmac_dma_ops = { .dma_interrupt = sun8i_dwmac_dma_interrupt, }; +static int sun8i_dwmac_power_internal_phy(struct stmmac_priv *priv); + static int sun8i_dwmac_init(struct platform_device *pdev, void *priv) { + struct net_device *ndev = platform_get_drvdata(pdev); struct sunxi_priv_data *gmac = priv; int ret; @@ -554,13 +559,25 @@ static int sun8i_dwmac_init(struct platform_device *pdev, void *priv) ret = clk_prepare_enable(gmac->tx_clk); if (ret) { - if (gmac->regulator) - regulator_disable(gmac->regulator); dev_err(&pdev->dev, "Could not enable AHB clock\n"); - return ret; + goto err_disable_regulator; + } + + if (gmac->use_internal_phy) { + ret = sun8i_dwmac_power_internal_phy(netdev_priv(ndev)); + if (ret) + goto err_disable_clk; } return 0; + +err_disable_clk: + clk_disable_unprepare(gmac->tx_clk); +err_disable_regulator: + if (gmac->regulator) + regulator_disable(gmac->regulator); + + return ret; } static void sun8i_dwmac_core_init(struct mac_device_info *hw, @@ -831,7 +848,6 @@ static int mdio_mux_syscon_switch_fn(int current_child, int desired_child, struct sunxi_priv_data *gmac = priv->plat->bsp_priv; u32 reg, val; int ret = 0; - bool need_power_ephy = false; if (current_child ^ desired_child) { regmap_field_read(gmac->regmap_field, ®); @@ -839,13 +855,12 @@ static int mdio_mux_syscon_switch_fn(int current_child, int desired_child, case DWMAC_SUN8I_MDIO_MUX_INTERNAL_ID: dev_info(priv->device, "Switch mux to internal PHY"); val = (reg & ~H3_EPHY_MUX_MASK) | H3_EPHY_SELECT; - - need_power_ephy = true; + gmac->use_internal_phy = true; break; case DWMAC_SUN8I_MDIO_MUX_EXTERNAL_ID: dev_info(priv->device, "Switch mux to external PHY"); val = (reg & ~H3_EPHY_MUX_MASK) | H3_EPHY_SHUTDOWN; - need_power_ephy = false; + gmac->use_internal_phy = false; break; default: dev_err(priv->device, "Invalid child ID %x\n", @@ -853,7 +868,7 @@ static int mdio_mux_syscon_switch_fn(int current_child, int desired_child, return -EINVAL; } regmap_field_write(gmac->regmap_field, val); - if (need_power_ephy) { + if (gmac->use_internal_phy) { ret = sun8i_dwmac_power_internal_phy(priv); if (ret) return ret; From 9b1e39cf5dd81f33186cdb950fcf75a121f1a9a7 Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Sun, 3 Jan 2021 05:17:44 -0600 Subject: [PATCH 381/547] net: stmmac: dwmac-sun8i: Balance syscon (de)initialization Previously, sun8i_dwmac_set_syscon was called from a chain of functions in several different files: sun8i_dwmac_probe stmmac_dvr_probe stmmac_hw_init stmmac_hwif_init sun8i_dwmac_setup sun8i_dwmac_set_syscon which made the lifetime of the syscon values hard to reason about. Part of the problem is that there is no similar platform driver callback from stmmac_dvr_remove. As a result, the driver unset the syscon value in sun8i_dwmac_exit, but this leaves it uninitialized after a suspend/ resume cycle. It was also unset a second time (outside sun8i_dwmac_exit) in the probe error path. Move the init to the earliest available place in sun8i_dwmac_probe (after stmmac_probe_config_dt, which initializes plat_dat), and the deinit to the corresponding position in the cleanup order. Since priv is not filled in until stmmac_dvr_probe, this requires changing the sun8i_dwmac_set_syscon parameters to priv's two relevant members. Fixes: 9f93ac8d4085 ("net-next: stmmac: Add dwmac-sun8i") Fixes: 634db83b8265 ("net: stmmac: dwmac-sun8i: Handle integrated/external MDIOs") Signed-off-by: Samuel Holland Signed-off-by: David S. Miller --- .../net/ethernet/stmicro/stmmac/dwmac-sun8i.c | 50 +++++++++---------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c index e2c25c1c702a..a5e0eff4a387 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c @@ -898,22 +898,23 @@ static int sun8i_dwmac_register_mdio_mux(struct stmmac_priv *priv) return ret; } -static int sun8i_dwmac_set_syscon(struct stmmac_priv *priv) +static int sun8i_dwmac_set_syscon(struct device *dev, + struct plat_stmmacenet_data *plat) { - struct sunxi_priv_data *gmac = priv->plat->bsp_priv; - struct device_node *node = priv->device->of_node; + struct sunxi_priv_data *gmac = plat->bsp_priv; + struct device_node *node = dev->of_node; int ret; u32 reg, val; ret = regmap_field_read(gmac->regmap_field, &val); if (ret) { - dev_err(priv->device, "Fail to read from regmap field.\n"); + dev_err(dev, "Fail to read from regmap field.\n"); return ret; } reg = gmac->variant->default_syscon_value; if (reg != val) - dev_warn(priv->device, + dev_warn(dev, "Current syscon value is not the default %x (expect %x)\n", val, reg); @@ -926,9 +927,9 @@ static int sun8i_dwmac_set_syscon(struct stmmac_priv *priv) /* Force EPHY xtal frequency to 24MHz. */ reg |= H3_EPHY_CLK_SEL; - ret = of_mdio_parse_addr(priv->device, priv->plat->phy_node); + ret = of_mdio_parse_addr(dev, plat->phy_node); if (ret < 0) { - dev_err(priv->device, "Could not parse MDIO addr\n"); + dev_err(dev, "Could not parse MDIO addr\n"); return ret; } /* of_mdio_parse_addr returns a valid (0 ~ 31) PHY @@ -944,17 +945,17 @@ static int sun8i_dwmac_set_syscon(struct stmmac_priv *priv) if (!of_property_read_u32(node, "allwinner,tx-delay-ps", &val)) { if (val % 100) { - dev_err(priv->device, "tx-delay must be a multiple of 100\n"); + dev_err(dev, "tx-delay must be a multiple of 100\n"); return -EINVAL; } val /= 100; - dev_dbg(priv->device, "set tx-delay to %x\n", val); + dev_dbg(dev, "set tx-delay to %x\n", val); if (val <= gmac->variant->tx_delay_max) { reg &= ~(gmac->variant->tx_delay_max << SYSCON_ETXDC_SHIFT); reg |= (val << SYSCON_ETXDC_SHIFT); } else { - dev_err(priv->device, "Invalid TX clock delay: %d\n", + dev_err(dev, "Invalid TX clock delay: %d\n", val); return -EINVAL; } @@ -962,17 +963,17 @@ static int sun8i_dwmac_set_syscon(struct stmmac_priv *priv) if (!of_property_read_u32(node, "allwinner,rx-delay-ps", &val)) { if (val % 100) { - dev_err(priv->device, "rx-delay must be a multiple of 100\n"); + dev_err(dev, "rx-delay must be a multiple of 100\n"); return -EINVAL; } val /= 100; - dev_dbg(priv->device, "set rx-delay to %x\n", val); + dev_dbg(dev, "set rx-delay to %x\n", val); if (val <= gmac->variant->rx_delay_max) { reg &= ~(gmac->variant->rx_delay_max << SYSCON_ERXDC_SHIFT); reg |= (val << SYSCON_ERXDC_SHIFT); } else { - dev_err(priv->device, "Invalid RX clock delay: %d\n", + dev_err(dev, "Invalid RX clock delay: %d\n", val); return -EINVAL; } @@ -983,7 +984,7 @@ static int sun8i_dwmac_set_syscon(struct stmmac_priv *priv) if (gmac->variant->support_rmii) reg &= ~SYSCON_RMII_EN; - switch (priv->plat->interface) { + switch (plat->interface) { case PHY_INTERFACE_MODE_MII: /* default */ break; @@ -997,8 +998,8 @@ static int sun8i_dwmac_set_syscon(struct stmmac_priv *priv) reg |= SYSCON_RMII_EN | SYSCON_ETCS_EXT_GMII; break; default: - dev_err(priv->device, "Unsupported interface mode: %s", - phy_modes(priv->plat->interface)); + dev_err(dev, "Unsupported interface mode: %s", + phy_modes(plat->interface)); return -EINVAL; } @@ -1023,8 +1024,6 @@ static void sun8i_dwmac_exit(struct platform_device *pdev, void *priv) sun8i_dwmac_unpower_internal_phy(gmac); } - sun8i_dwmac_unset_syscon(gmac); - clk_disable_unprepare(gmac->tx_clk); if (gmac->regulator) @@ -1059,16 +1058,11 @@ static struct mac_device_info *sun8i_dwmac_setup(void *ppriv) { struct mac_device_info *mac; struct stmmac_priv *priv = ppriv; - int ret; mac = devm_kzalloc(priv->device, sizeof(*mac), GFP_KERNEL); if (!mac) return NULL; - ret = sun8i_dwmac_set_syscon(priv); - if (ret) - return NULL; - mac->pcsr = priv->ioaddr; mac->mac = &sun8i_dwmac_ops; mac->dma = &sun8i_dwmac_dma_ops; @@ -1224,10 +1218,14 @@ static int sun8i_dwmac_probe(struct platform_device *pdev) plat_dat->exit = sun8i_dwmac_exit; plat_dat->setup = sun8i_dwmac_setup; - ret = sun8i_dwmac_init(pdev, plat_dat->bsp_priv); + ret = sun8i_dwmac_set_syscon(&pdev->dev, plat_dat); if (ret) goto dwmac_deconfig; + ret = sun8i_dwmac_init(pdev, plat_dat->bsp_priv); + if (ret) + goto dwmac_syscon; + ret = stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res); if (ret) goto dwmac_exit; @@ -1256,11 +1254,12 @@ static int sun8i_dwmac_probe(struct platform_device *pdev) dwmac_mux: reset_control_put(gmac->rst_ephy); clk_put(gmac->ephy_clk); - sun8i_dwmac_unset_syscon(gmac); dwmac_remove: stmmac_dvr_remove(&pdev->dev); dwmac_exit: sun8i_dwmac_exit(pdev, gmac); +dwmac_syscon: + sun8i_dwmac_unset_syscon(gmac); dwmac_deconfig: stmmac_remove_config_dt(pdev, plat_dat); @@ -1281,6 +1280,7 @@ static int sun8i_dwmac_remove(struct platform_device *pdev) } stmmac_pltfr_remove(pdev); + sun8i_dwmac_unset_syscon(gmac); return 0; } From 9f9d41f03bb07069e6e83ff4720cfea74a63898d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 4 Jan 2021 17:22:24 -0800 Subject: [PATCH 382/547] docs: net: fix documentation on .ndo_get_stats Fix calling context. Signed-off-by: Jakub Kicinski Reviewed-by: Vladimir Oltean Signed-off-by: David S. Miller --- Documentation/networking/netdevices.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/networking/netdevices.rst b/Documentation/networking/netdevices.rst index 5a85fcc80c76..e65665c5ab50 100644 --- a/Documentation/networking/netdevices.rst +++ b/Documentation/networking/netdevices.rst @@ -64,8 +64,8 @@ ndo_do_ioctl: Context: process ndo_get_stats: - Synchronization: dev_base_lock rwlock. - Context: nominally process, but don't sleep inside an rwlock + Synchronization: rtnl_lock() semaphore, dev_base_lock rwlock, or RCU. + Context: atomic (can't sleep under rwlock or RCU) ndo_start_xmit: Synchronization: __netif_tx_lock spinlock. From f04bbcbf1e38d192e94bbfa126731a52332c40b1 Mon Sep 17 00:00:00 2001 From: Yonglong Liu Date: Tue, 5 Jan 2021 11:37:26 +0800 Subject: [PATCH 383/547] net: hns3: fix a phy loopback fail issue When phy driver does not implement the set_loopback interface, phy loopback test will return -EOPNOTSUPP, and the loopback test will fail. So when phy driver does not implement the set_loopback interface, don't do phy loopback test. Fixes: c9765a89d142 ("net: hns3: add phy selftest function") Signed-off-by: Yonglong Liu Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index e6f37f91c489..135bd0a0dcaf 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -752,7 +752,8 @@ static int hclge_get_sset_count(struct hnae3_handle *handle, int stringset) handle->flags |= HNAE3_SUPPORT_SERDES_SERIAL_LOOPBACK; handle->flags |= HNAE3_SUPPORT_SERDES_PARALLEL_LOOPBACK; - if (hdev->hw.mac.phydev) { + if (hdev->hw.mac.phydev && hdev->hw.mac.phydev->drv && + hdev->hw.mac.phydev->drv->set_loopback) { count += 1; handle->flags |= HNAE3_SUPPORT_PHY_LOOPBACK; } From 65e61e3c2a619c4d4b873885b2d5394025ed117b Mon Sep 17 00:00:00 2001 From: Yufeng Mo Date: Tue, 5 Jan 2021 11:37:27 +0800 Subject: [PATCH 384/547] net: hns3: fix the number of queues actually used by ARQ HCLGE_MBX_MAX_ARQ_MSG_NUM is used to apply memory for the number of queues used by ARQ(Asynchronous Receive Queue), so the head and tail pointers should also use this macro. Fixes: 07a0556a3a73 ("net: hns3: Changes to support ARQ(Asynchronous Receive Queue)") Signed-off-by: Yufeng Mo Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hclge_mbx.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hclge_mbx.h b/drivers/net/ethernet/hisilicon/hns3/hclge_mbx.h index fb5e8842983c..33defa4c180a 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hclge_mbx.h +++ b/drivers/net/ethernet/hisilicon/hns3/hclge_mbx.h @@ -169,7 +169,7 @@ struct hclgevf_mbx_arq_ring { #define hclge_mbx_ring_ptr_move_crq(crq) \ (crq->next_to_use = (crq->next_to_use + 1) % crq->desc_num) #define hclge_mbx_tail_ptr_move_arq(arq) \ - (arq.tail = (arq.tail + 1) % HCLGE_MBX_MAX_ARQ_MSG_SIZE) + (arq.tail = (arq.tail + 1) % HCLGE_MBX_MAX_ARQ_MSG_NUM) #define hclge_mbx_head_ptr_move_arq(arq) \ - (arq.head = (arq.head + 1) % HCLGE_MBX_MAX_ARQ_MSG_SIZE) + (arq.head = (arq.head + 1) % HCLGE_MBX_MAX_ARQ_MSG_NUM) #endif From ab6e32d2913a594bc8f822ce4a75c400190b2ecc Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Tue, 5 Jan 2021 11:37:28 +0800 Subject: [PATCH 385/547] net: hns3: fix incorrect handling of sctp6 rss tuple For DEVICE_VERSION_V2, the hardware only supports src-ip, dst-ip and verification-tag for rss tuple set of sctp6 packet. For DEVICE_VERSION_V3, the hardware supports src-port and dst-port as well. Currently, when user queries the sctp6 rss tuples info, some unsupported information will be showed on V2. So add a check for hardware version when initializing and queries sctp6 rss tuple to fix this issue. Fixes: 46a3df9f9718 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility Layer Support") Signed-off-by: Jian Shen Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 6 ++++-- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h | 2 ++ .../net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 9 ++++++--- .../net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h | 2 ++ 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 135bd0a0dcaf..c242883fea5d 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -4538,8 +4538,8 @@ static int hclge_set_rss_tuple(struct hnae3_handle *handle, req->ipv4_sctp_en = tuple_sets; break; case SCTP_V6_FLOW: - if ((nfc->data & RXH_L4_B_0_1) || - (nfc->data & RXH_L4_B_2_3)) + if (hdev->ae_dev->dev_version <= HNAE3_DEVICE_VERSION_V2 && + (nfc->data & (RXH_L4_B_0_1 | RXH_L4_B_2_3))) return -EINVAL; req->ipv6_sctp_en = tuple_sets; @@ -4731,6 +4731,8 @@ static void hclge_rss_init_cfg(struct hclge_dev *hdev) vport[i].rss_tuple_sets.ipv6_udp_en = HCLGE_RSS_INPUT_TUPLE_OTHER; vport[i].rss_tuple_sets.ipv6_sctp_en = + hdev->ae_dev->dev_version <= HNAE3_DEVICE_VERSION_V2 ? + HCLGE_RSS_INPUT_TUPLE_SCTP_NO_PORT : HCLGE_RSS_INPUT_TUPLE_SCTP; vport[i].rss_tuple_sets.ipv6_fragment_en = HCLGE_RSS_INPUT_TUPLE_OTHER; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index 50a294dfaff5..ca46bc9110d7 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -107,6 +107,8 @@ #define HCLGE_D_IP_BIT BIT(2) #define HCLGE_S_IP_BIT BIT(3) #define HCLGE_V_TAG_BIT BIT(4) +#define HCLGE_RSS_INPUT_TUPLE_SCTP_NO_PORT \ + (HCLGE_D_IP_BIT | HCLGE_S_IP_BIT | HCLGE_V_TAG_BIT) #define HCLGE_RSS_TC_SIZE_0 1 #define HCLGE_RSS_TC_SIZE_1 2 diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index 145757cb70f9..674b3a22e91f 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -917,8 +917,8 @@ static int hclgevf_set_rss_tuple(struct hnae3_handle *handle, req->ipv4_sctp_en = tuple_sets; break; case SCTP_V6_FLOW: - if ((nfc->data & RXH_L4_B_0_1) || - (nfc->data & RXH_L4_B_2_3)) + if (hdev->ae_dev->dev_version <= HNAE3_DEVICE_VERSION_V2 && + (nfc->data & (RXH_L4_B_0_1 | RXH_L4_B_2_3))) return -EINVAL; req->ipv6_sctp_en = tuple_sets; @@ -2502,7 +2502,10 @@ static void hclgevf_rss_init_cfg(struct hclgevf_dev *hdev) tuple_sets->ipv4_fragment_en = HCLGEVF_RSS_INPUT_TUPLE_OTHER; tuple_sets->ipv6_tcp_en = HCLGEVF_RSS_INPUT_TUPLE_OTHER; tuple_sets->ipv6_udp_en = HCLGEVF_RSS_INPUT_TUPLE_OTHER; - tuple_sets->ipv6_sctp_en = HCLGEVF_RSS_INPUT_TUPLE_SCTP; + tuple_sets->ipv6_sctp_en = + hdev->ae_dev->dev_version <= HNAE3_DEVICE_VERSION_V2 ? + HCLGEVF_RSS_INPUT_TUPLE_SCTP_NO_PORT : + HCLGEVF_RSS_INPUT_TUPLE_SCTP; tuple_sets->ipv6_fragment_en = HCLGEVF_RSS_INPUT_TUPLE_OTHER; } diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h index 1b183bc35604..f6d817a3edcb 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h @@ -122,6 +122,8 @@ #define HCLGEVF_D_IP_BIT BIT(2) #define HCLGEVF_S_IP_BIT BIT(3) #define HCLGEVF_V_TAG_BIT BIT(4) +#define HCLGEVF_RSS_INPUT_TUPLE_SCTP_NO_PORT \ + (HCLGEVF_D_IP_BIT | HCLGEVF_S_IP_BIT | HCLGEVF_V_TAG_BIT) #define HCLGEVF_STATS_TIMER_INTERVAL 36U From 7a68d725e4ea384977445e0bcaed3d7de83ab5b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jouni=20K=2E=20Sepp=C3=A4nen?= Date: Tue, 5 Jan 2021 06:52:49 +0200 Subject: [PATCH 386/547] net: cdc_ncm: correct overhead in delayed_ndp_size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Aligning to tx_ndp_modulus is not sufficient because the next align call can be cdc_ncm_align_tail, which can add up to ctx->tx_modulus + ctx->tx_remainder - 1 bytes. This used to lead to occasional crashes on a Huawei 909s-120 LTE module as follows: - the condition marked /* if there is a remaining skb [...] */ is true so the swaps happen - skb_out is set from ctx->tx_curr_skb - skb_out->len is exactly 0x3f52 - ctx->tx_curr_size is 0x4000 and delayed_ndp_size is 0xac (note that the sum of skb_out->len and delayed_ndp_size is 0x3ffe) - the for loop over n is executed once - the cdc_ncm_align_tail call marked /* align beginning of next frame */ increases skb_out->len to 0x3f56 (the sum is now 0x4002) - the condition marked /* check if we had enough room left [...] */ is false so we break out of the loop - the condition marked /* If requested, put NDP at end of frame. */ is true so the NDP is written into skb_out - now skb_out->len is 0x4002, so padding_count is minus two interpreted as an unsigned number, which is used as the length argument to memset, leading to a crash with various symptoms but usually including > Call Trace: > > cdc_ncm_fill_tx_frame+0x83a/0x970 [cdc_ncm] > cdc_mbim_tx_fixup+0x1d9/0x240 [cdc_mbim] > usbnet_start_xmit+0x5d/0x720 [usbnet] The cdc_ncm_align_tail call first aligns on a ctx->tx_modulus boundary (adding at most ctx->tx_modulus-1 bytes), then adds ctx->tx_remainder bytes. Alternatively, the next alignment call can occur in cdc_ncm_ndp16 or cdc_ncm_ndp32, in which case at most ctx->tx_ndp_modulus-1 bytes are added. A similar problem has occurred before, and the code is nontrivial to reason about, so add a guard before the crashing call. By that time it is too late to prevent any memory corruption (we'll have written past the end of the buffer already) but we can at least try to get a warning written into an on-disk log by avoiding the hard crash caused by padding past the buffer with a huge number of zeros. Signed-off-by: Jouni K. Seppänen Fixes: 4a0e3e989d66 ("cdc_ncm: Add support for moving NDP to end of NCM frame") Link: https://bugzilla.kernel.org/show_bug.cgi?id=209407 Reported-by: kernel test robot Reviewed-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/cdc_ncm.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index 3b816a4731f2..5a78848db93f 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -1199,7 +1199,10 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign) * accordingly. Otherwise, we should check here. */ if (ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END) - delayed_ndp_size = ALIGN(ctx->max_ndp_size, ctx->tx_ndp_modulus); + delayed_ndp_size = ctx->max_ndp_size + + max_t(u32, + ctx->tx_ndp_modulus, + ctx->tx_modulus + ctx->tx_remainder) - 1; else delayed_ndp_size = 0; @@ -1410,7 +1413,8 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign) if (!(dev->driver_info->flags & FLAG_SEND_ZLP) && skb_out->len > ctx->min_tx_pkt) { padding_count = ctx->tx_curr_size - skb_out->len; - skb_put_zero(skb_out, padding_count); + if (!WARN_ON(padding_count > ctx->tx_curr_size)) + skb_put_zero(skb_out, padding_count); } else if (skb_out->len < ctx->tx_curr_size && (skb_out->len % dev->maxpacket) == 0) { skb_put_u8(skb_out, 0); /* force short packet */ From 4beb17e553b49c3dd74505c9f361e756aaae653e Mon Sep 17 00:00:00 2001 From: Qinglang Miao Date: Tue, 5 Jan 2021 13:57:54 +0800 Subject: [PATCH 387/547] net: qrtr: fix null-ptr-deref in qrtr_ns_remove A null-ptr-deref bug is reported by Hulk Robot like this: -------------- KASAN: null-ptr-deref in range [0x0000000000000128-0x000000000000012f] Call Trace: qrtr_ns_remove+0x22/0x40 [ns] qrtr_proto_fini+0xa/0x31 [qrtr] __x64_sys_delete_module+0x337/0x4e0 do_syscall_64+0x34/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x468ded -------------- When qrtr_ns_init fails in qrtr_proto_init, qrtr_ns_remove which would be called later on would raise a null-ptr-deref because qrtr_ns.workqueue has been destroyed. Fix it by making qrtr_ns_init have a return value and adding a check in qrtr_proto_init. Reported-by: Hulk Robot Signed-off-by: Qinglang Miao Signed-off-by: David S. Miller --- net/qrtr/ns.c | 7 ++++--- net/qrtr/qrtr.c | 16 +++++++++++----- net/qrtr/qrtr.h | 2 +- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c index 56aaf8cb6527..8d00dfe8139e 100644 --- a/net/qrtr/ns.c +++ b/net/qrtr/ns.c @@ -755,7 +755,7 @@ static void qrtr_ns_data_ready(struct sock *sk) queue_work(qrtr_ns.workqueue, &qrtr_ns.work); } -void qrtr_ns_init(void) +int qrtr_ns_init(void) { struct sockaddr_qrtr sq; int ret; @@ -766,7 +766,7 @@ void qrtr_ns_init(void) ret = sock_create_kern(&init_net, AF_QIPCRTR, SOCK_DGRAM, PF_QIPCRTR, &qrtr_ns.sock); if (ret < 0) - return; + return ret; ret = kernel_getsockname(qrtr_ns.sock, (struct sockaddr *)&sq); if (ret < 0) { @@ -797,12 +797,13 @@ void qrtr_ns_init(void) if (ret < 0) goto err_wq; - return; + return 0; err_wq: destroy_workqueue(qrtr_ns.workqueue); err_sock: sock_release(qrtr_ns.sock); + return ret; } EXPORT_SYMBOL_GPL(qrtr_ns_init); diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index f4ab3ca6d73b..b34358282f37 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -1287,13 +1287,19 @@ static int __init qrtr_proto_init(void) return rc; rc = sock_register(&qrtr_family); - if (rc) { - proto_unregister(&qrtr_proto); - return rc; - } + if (rc) + goto err_proto; - qrtr_ns_init(); + rc = qrtr_ns_init(); + if (rc) + goto err_sock; + return 0; + +err_sock: + sock_unregister(qrtr_family.family); +err_proto: + proto_unregister(&qrtr_proto); return rc; } postcore_initcall(qrtr_proto_init); diff --git a/net/qrtr/qrtr.h b/net/qrtr/qrtr.h index dc2b67f17927..3f2d28696062 100644 --- a/net/qrtr/qrtr.h +++ b/net/qrtr/qrtr.h @@ -29,7 +29,7 @@ void qrtr_endpoint_unregister(struct qrtr_endpoint *ep); int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len); -void qrtr_ns_init(void); +int qrtr_ns_init(void); void qrtr_ns_remove(void); From 445c6198fe7be03b7d38e66fe8d4b3187bc251d4 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 5 Jan 2021 20:15:15 +1100 Subject: [PATCH 388/547] net: ethernet: fs_enet: Add missing MODULE_LICENSE Since commit 1d6cd3929360 ("modpost: turn missing MODULE_LICENSE() into error") the ppc32_allmodconfig build fails with: ERROR: modpost: missing MODULE_LICENSE() in drivers/net/ethernet/freescale/fs_enet/mii-fec.o ERROR: modpost: missing MODULE_LICENSE() in drivers/net/ethernet/freescale/fs_enet/mii-bitbang.o Add the missing MODULE_LICENSEs to fix the build. Both files include a copyright header indicating they are GPL v2. Signed-off-by: Michael Ellerman Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c | 1 + drivers/net/ethernet/freescale/fs_enet/mii-fec.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c b/drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c index c8e5d889bd81..21de56345503 100644 --- a/drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c +++ b/drivers/net/ethernet/freescale/fs_enet/mii-bitbang.c @@ -223,3 +223,4 @@ static struct platform_driver fs_enet_bb_mdio_driver = { }; module_platform_driver(fs_enet_bb_mdio_driver); +MODULE_LICENSE("GPL"); diff --git a/drivers/net/ethernet/freescale/fs_enet/mii-fec.c b/drivers/net/ethernet/freescale/fs_enet/mii-fec.c index 8b51ee142fa3..152f4d83765a 100644 --- a/drivers/net/ethernet/freescale/fs_enet/mii-fec.c +++ b/drivers/net/ethernet/freescale/fs_enet/mii-fec.c @@ -224,3 +224,4 @@ static struct platform_driver fs_enet_fec_mdio_driver = { }; module_platform_driver(fs_enet_fec_mdio_driver); +MODULE_LICENSE("GPL"); From 3503ee6c0bec5f173d606359e6384a5ef85492fb Mon Sep 17 00:00:00 2001 From: Po-Hsu Lin Date: Tue, 5 Jan 2021 18:17:40 +0800 Subject: [PATCH 389/547] selftests: fix the return value for UDP GRO test The udpgro.sh will always return 0 (unless the bpf selftest was not build first) even if there are some failed sub test-cases. Therefore the kselftest framework will report this case is OK. Check and return the exit status of each test to make it easier to spot real failures. Signed-off-by: Po-Hsu Lin Signed-off-by: David S. Miller --- tools/testing/selftests/net/udpgro.sh | 34 +++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tools/testing/selftests/net/udpgro.sh b/tools/testing/selftests/net/udpgro.sh index ac2a30be9b32..f8a19f548ae9 100755 --- a/tools/testing/selftests/net/udpgro.sh +++ b/tools/testing/selftests/net/udpgro.sh @@ -5,6 +5,14 @@ readonly PEER_NS="ns-peer-$(mktemp -u XXXXXX)" +# set global exit status, but never reset nonzero one. +check_err() +{ + if [ $ret -eq 0 ]; then + ret=$1 + fi +} + cleanup() { local -r jobs="$(jobs -p)" local -r ns="$(ip netns list|grep $PEER_NS)" @@ -44,7 +52,9 @@ run_one() { # Hack: let bg programs complete the startup sleep 0.1 ./udpgso_bench_tx ${tx_args} + ret=$? wait $(jobs -p) + return $ret } run_test() { @@ -87,8 +97,10 @@ run_one_nat() { sleep 0.1 ./udpgso_bench_tx ${tx_args} + ret=$? kill -INT $pid wait $(jobs -p) + return $ret } run_one_2sock() { @@ -110,7 +122,9 @@ run_one_2sock() { sleep 0.1 # first UDP GSO socket should be closed at this point ./udpgso_bench_tx ${tx_args} + ret=$? wait $(jobs -p) + return $ret } run_nat_test() { @@ -131,36 +145,54 @@ run_all() { local -r core_args="-l 4" local -r ipv4_args="${core_args} -4 -D 192.168.1.1" local -r ipv6_args="${core_args} -6 -D 2001:db8::1" + ret=0 echo "ipv4" run_test "no GRO" "${ipv4_args} -M 10 -s 1400" "-4 -n 10 -l 1400" + check_err $? # explicitly check we are not receiving UDP_SEGMENT cmsg (-S -1) # when GRO does not take place run_test "no GRO chk cmsg" "${ipv4_args} -M 10 -s 1400" "-4 -n 10 -l 1400 -S -1" + check_err $? # the GSO packets are aggregated because: # * veth schedule napi after each xmit # * segmentation happens in BH context, veth napi poll is delayed after # the transmission of the last segment run_test "GRO" "${ipv4_args} -M 1 -s 14720 -S 0 " "-4 -n 1 -l 14720" + check_err $? run_test "GRO chk cmsg" "${ipv4_args} -M 1 -s 14720 -S 0 " "-4 -n 1 -l 14720 -S 1472" + check_err $? run_test "GRO with custom segment size" "${ipv4_args} -M 1 -s 14720 -S 500 " "-4 -n 1 -l 14720" + check_err $? run_test "GRO with custom segment size cmsg" "${ipv4_args} -M 1 -s 14720 -S 500 " "-4 -n 1 -l 14720 -S 500" + check_err $? run_nat_test "bad GRO lookup" "${ipv4_args} -M 1 -s 14720 -S 0" "-n 10 -l 1472" + check_err $? run_2sock_test "multiple GRO socks" "${ipv4_args} -M 1 -s 14720 -S 0 " "-4 -n 1 -l 14720 -S 1472" + check_err $? echo "ipv6" run_test "no GRO" "${ipv6_args} -M 10 -s 1400" "-n 10 -l 1400" + check_err $? run_test "no GRO chk cmsg" "${ipv6_args} -M 10 -s 1400" "-n 10 -l 1400 -S -1" + check_err $? run_test "GRO" "${ipv6_args} -M 1 -s 14520 -S 0" "-n 1 -l 14520" + check_err $? run_test "GRO chk cmsg" "${ipv6_args} -M 1 -s 14520 -S 0" "-n 1 -l 14520 -S 1452" + check_err $? run_test "GRO with custom segment size" "${ipv6_args} -M 1 -s 14520 -S 500" "-n 1 -l 14520" + check_err $? run_test "GRO with custom segment size cmsg" "${ipv6_args} -M 1 -s 14520 -S 500" "-n 1 -l 14520 -S 500" + check_err $? run_nat_test "bad GRO lookup" "${ipv6_args} -M 1 -s 14520 -S 0" "-n 10 -l 1452" + check_err $? run_2sock_test "multiple GRO socks" "${ipv6_args} -M 1 -s 14520 -S 0 " "-n 1 -l 14520 -S 1452" + check_err $? + return $ret } if [ ! -f ../bpf/xdp_dummy.o ]; then @@ -180,3 +212,5 @@ elif [[ $1 == "__subprocess_2sock" ]]; then shift run_one_2sock $@ fi + +exit $? From 67208692802ce3cacfa00fe586dc0cb1bef0a51c Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 6 Jan 2021 00:42:19 +0100 Subject: [PATCH 390/547] tools/resolve_btfids: Warn when having multiple IDs for single type The kernel image can contain multiple types (structs/unions) with the same name. This causes distinct type hierarchies in BTF data and makes resolve_btfids fail with error like: BTFIDS vmlinux FAILED unresolved symbol udp6_sock as reported by Qais Yousef [1]. This change adds warning when multiple types of the same name are detected: BTFIDS vmlinux WARN: multiple IDs found for 'file': 526, 113351 - using 526 WARN: multiple IDs found for 'sk_buff': 2744, 113958 - using 2744 We keep the lower ID for the given type instance and let the build continue. Also changing the 'nr' variable name to 'nr_types' to avoid confusion. [1] https://lore.kernel.org/lkml/20201229151352.6hzmjvu3qh6p2qgg@e107158-lin/ Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210105234219.970039-1-jolsa@kernel.org --- tools/bpf/resolve_btfids/main.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index e3ea569ee125..7409d7860aa6 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -139,6 +139,8 @@ int eprintf(int level, int var, const char *fmt, ...) #define pr_debug2(fmt, ...) pr_debugN(2, pr_fmt(fmt), ##__VA_ARGS__) #define pr_err(fmt, ...) \ eprintf(0, verbose, pr_fmt(fmt), ##__VA_ARGS__) +#define pr_info(fmt, ...) \ + eprintf(0, verbose, pr_fmt(fmt), ##__VA_ARGS__) static bool is_btf_id(const char *name) { @@ -472,7 +474,7 @@ static int symbols_resolve(struct object *obj) int nr_funcs = obj->nr_funcs; int err, type_id; struct btf *btf; - __u32 nr; + __u32 nr_types; btf = btf__parse(obj->btf ?: obj->path, NULL); err = libbpf_get_error(btf); @@ -483,12 +485,12 @@ static int symbols_resolve(struct object *obj) } err = -1; - nr = btf__get_nr_types(btf); + nr_types = btf__get_nr_types(btf); /* * Iterate all the BTF types and search for collected symbol IDs. */ - for (type_id = 1; type_id <= nr; type_id++) { + for (type_id = 1; type_id <= nr_types; type_id++) { const struct btf_type *type; struct rb_root *root; struct btf_id *id; @@ -526,8 +528,13 @@ static int symbols_resolve(struct object *obj) id = btf_id__find(root, str); if (id) { - id->id = type_id; - (*nr)--; + if (id->id) { + pr_info("WARN: multiple IDs found for '%s': %d, %d - using %d\n", + str, id->id, type_id, id->id); + } else { + id->id = type_id; + (*nr)--; + } } } From 19fce0470f05031e6af36e49ce222d0f0050d432 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 1 Dec 2020 17:52:43 -0800 Subject: [PATCH 391/547] nvme-fc: avoid calling _nvme_fc_abort_outstanding_ios from interrupt context Recent patches changed calling sequences. nvme_fc_abort_outstanding_ios used to be called from a timeout or work context. Now it is being called in an io completion context, which can be an interrupt handler. Unfortunately, the abort outstanding ios routine attempts to stop nvme queues and nested routines that may try to sleep, which is in conflict with the interrupt handler. Correct replacing the direct call with a work element scheduling, and the abort outstanding ios routine will be called in the work element. Fixes: 95ced8a2c72d ("nvme-fc: eliminate terminate_io use by nvme_fc_error_recovery") Signed-off-by: James Smart Reported-by: Daniel Wagner Tested-by: Daniel Wagner Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 38373a0e86ef..5f36cfa8136c 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -166,6 +166,7 @@ struct nvme_fc_ctrl { struct blk_mq_tag_set admin_tag_set; struct blk_mq_tag_set tag_set; + struct work_struct ioerr_work; struct delayed_work connect_work; struct kref ref; @@ -1888,6 +1889,15 @@ __nvme_fc_fcpop_chk_teardowns(struct nvme_fc_ctrl *ctrl, } } +static void +nvme_fc_ctrl_ioerr_work(struct work_struct *work) +{ + struct nvme_fc_ctrl *ctrl = + container_of(work, struct nvme_fc_ctrl, ioerr_work); + + nvme_fc_error_recovery(ctrl, "transport detected io error"); +} + static void nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) { @@ -2046,7 +2056,7 @@ done: check_error: if (terminate_assoc) - nvme_fc_error_recovery(ctrl, "transport detected io error"); + queue_work(nvme_reset_wq, &ctrl->ioerr_work); } static int @@ -3233,6 +3243,7 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl) { struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); + cancel_work_sync(&ctrl->ioerr_work); cancel_delayed_work_sync(&ctrl->connect_work); /* * kill the association on the link side. this will block @@ -3449,6 +3460,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, INIT_WORK(&ctrl->ctrl.reset_work, nvme_fc_reset_ctrl_work); INIT_DELAYED_WORK(&ctrl->connect_work, nvme_fc_connect_ctrl_work); + INIT_WORK(&ctrl->ioerr_work, nvme_fc_ctrl_ioerr_work); spin_lock_init(&ctrl->lock); /* io queue count */ @@ -3540,6 +3552,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, fail_ctrl: nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING); + cancel_work_sync(&ctrl->ioerr_work); cancel_work_sync(&ctrl->ctrl.reset_work); cancel_delayed_work_sync(&ctrl->connect_work); From 2b54996b7d56badc563755840838614f2fa9c4de Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 7 Dec 2020 12:29:40 -0800 Subject: [PATCH 392/547] nvme-fcloop: Fix sscanf type and list_first_entry_or_null warnings Kernel robot had the following warnings: >> fcloop.c:1506:6: warning: %x in format string (no. 1) requires >> 'unsigned int *' but the argument type is 'signed int *'. >> [invalidScanfArgType_int] >> if (sscanf(buf, "%x:%d:%d", &opcode, &starting, &amount) != 3) >> ^ Resolve by changing opcode from and int to an unsigned int and >> fcloop.c:1632:32: warning: Uninitialized variable: lport [uninitvar] >> ret = __wait_localport_unreg(lport); >> ^ >> fcloop.c:1615:28: warning: Uninitialized variable: nport [uninitvar] >> ret = __remoteport_unreg(nport, rport); >> ^ These aren't actual issues as the values are assigned prior to use. It appears the tool doesn't understand list_first_entry_or_null(). Regardless, quiet the tool by initializing the pointers to NULL at declaration. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fcloop.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 733d9363900e..68213f0a052b 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -1501,7 +1501,8 @@ static ssize_t fcloop_set_cmd_drop(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - int opcode, starting, amount; + unsigned int opcode; + int starting, amount; if (sscanf(buf, "%x:%d:%d", &opcode, &starting, &amount) != 3) return -EBADRQC; @@ -1588,8 +1589,8 @@ out_destroy_class: static void __exit fcloop_exit(void) { - struct fcloop_lport *lport; - struct fcloop_nport *nport; + struct fcloop_lport *lport = NULL; + struct fcloop_nport *nport = NULL; struct fcloop_tport *tport; struct fcloop_rport *rport; unsigned long flags; From 7ee5c78ca3895d44e918c38332921983ed678be0 Mon Sep 17 00:00:00 2001 From: Gopal Tiwari Date: Fri, 4 Dec 2020 21:46:57 +0530 Subject: [PATCH 393/547] nvme-pci: mark Samsung PM1725a as IGNORE_DEV_SUBNQN A system with more than one of these SSDs will only have one usable. Hence the kernel fails to detect nvme devices due to duplicate cntlids. [ 6.274554] nvme nvme1: Duplicate cntlid 33 with nvme0, rejecting [ 6.274566] nvme nvme1: Removing after probe failure status: -22 Adding the NVME_QUIRK_IGNORE_DEV_SUBNQN quirk to resolves the issue. Signed-off-by: Gopal Tiwari Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index b4385cb0ff60..553871e6962b 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3196,7 +3196,8 @@ static const struct pci_device_id nvme_id_table[] = { { PCI_DEVICE(0x144d, 0xa821), /* Samsung PM1725 */ .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, { PCI_DEVICE(0x144d, 0xa822), /* Samsung PM1725a */ - .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, + .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY | + NVME_QUIRK_IGNORE_DEV_SUBNQN, }, { PCI_DEVICE(0x1d1d, 0x1f1f), /* LighNVM qemu device */ .driver_data = NVME_QUIRK_LIGHTNVM, }, { PCI_DEVICE(0x1d1d, 0x2807), /* CNEX WL */ From 5c11f7d9f843bdd24cd29b95401938bc3f168070 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 21 Dec 2020 00:03:39 -0800 Subject: [PATCH 394/547] nvme-tcp: Fix possible race of io_work and direct send We may send a request (with or without its data) from two paths: 1. From our I/O context nvme_tcp_io_work which is triggered from: - queue_rq - r2t reception - socket data_ready and write_space callbacks 2. Directly from queue_rq if the send_list is empty (because we want to save the context switch associated with scheduling our io_work). However, given that now we have the send_mutex, we may run into a race condition where none of these contexts will send the pending payload to the controller. Both io_work send path and queue_rq send path opportunistically attempt to acquire the send_mutex however queue_rq only attempts to send a single request, and if io_work context fails to acquire the send_mutex it will complete without rescheduling itself. The race can trigger with the following sequence: 1. queue_rq sends request (no incapsule data) and blocks 2. RX path receives r2t - prepares data PDU to send, adds h2cdata PDU to the send_list and schedules io_work 3. io_work triggers and cannot acquire the send_mutex - because of (1), ends without self rescheduling 4. queue_rq completes the send, and completes ==> no context will send the h2cdata - timeout. Fix this by having queue_rq sending as much as it can from the send_list such that if it still has any left, its because the socket buffer is full and the socket write_space callback will trigger, thus guaranteeing that a context will be scheduled to send the h2cdata PDU. Fixes: db5ad6b7f8cd ("nvme-tcp: try to send request in queue_rq context") Reported-by: Potnuri Bharat Teja Reported-by: Samuel Jones Signed-off-by: Sagi Grimberg Tested-by: Potnuri Bharat Teja Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 1ba659927442..979ee31b8dd1 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -262,6 +262,16 @@ static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req, } } +static inline void nvme_tcp_send_all(struct nvme_tcp_queue *queue) +{ + int ret; + + /* drain the send queue as much as we can... */ + do { + ret = nvme_tcp_try_send(queue); + } while (ret > 0); +} + static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, bool sync, bool last) { @@ -279,7 +289,7 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, if (queue->io_cpu == smp_processor_id() && sync && empty && mutex_trylock(&queue->send_mutex)) { queue->more_requests = !last; - nvme_tcp_try_send(queue); + nvme_tcp_send_all(queue); queue->more_requests = false; mutex_unlock(&queue->send_mutex); } else if (last) { From 62df80165d7f197c9c0652e7416164f294a96661 Mon Sep 17 00:00:00 2001 From: Lalithambika Krishnakumar Date: Wed, 23 Dec 2020 14:09:00 -0800 Subject: [PATCH 395/547] nvme: avoid possible double fetch in handling CQE While handling the completion queue, keep a local copy of the command id from the DMA-accessible completion entry. This silences a time-of-check to time-of-use (TOCTOU) warning from KF/x[1], with respect to a Thunderclap[2] vulnerability analysis. The double-read impact appears benign. There may be a theoretical window for @command_id to be used as an adversary-controlled array-index-value for mounting a speculative execution attack, but that mitigation is saved for a potential follow-on. A man-in-the-middle attack on the data payload is out of scope for this analysis and is hopefully mitigated by filesystem integrity mechanisms. [1] https://github.com/intel/kernel-fuzzer-for-xen-project [2] http://thunderclap.io/thunderclap-paper-ndss2019.pdf Signed-off-by: Lalithambika Krishna Kumar Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 553871e6962b..50d9a20568a2 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -967,6 +967,7 @@ static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq) static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) { struct nvme_completion *cqe = &nvmeq->cqes[idx]; + __u16 command_id = READ_ONCE(cqe->command_id); struct request *req; /* @@ -975,17 +976,17 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) * aborts. We don't even bother to allocate a struct request * for them but rather special case them here. */ - if (unlikely(nvme_is_aen_req(nvmeq->qid, cqe->command_id))) { + if (unlikely(nvme_is_aen_req(nvmeq->qid, command_id))) { nvme_complete_async_event(&nvmeq->dev->ctrl, cqe->status, &cqe->result); return; } - req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), cqe->command_id); + req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), command_id); if (unlikely(!req)) { dev_warn(nvmeq->dev->ctrl.device, "invalid id %d completed on queue %d\n", - cqe->command_id, le16_to_cpu(cqe->sq_id)); + command_id, le16_to_cpu(cqe->sq_id)); return; } From 9b66fc02bec0ca613bc6d4c1d0049f727a95567d Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Wed, 30 Dec 2020 20:22:44 +0900 Subject: [PATCH 396/547] nvme: unexport functions with no external caller There are no callers for nvme_reset_ctrl_sync() and nvme_alloc_request_qid() so that we keep the symbols exported. Unexport those functions, mark them static and update the header file respectively. Signed-off-by: Minwoo Im Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 6 ++---- drivers/nvme/host/nvme.h | 3 --- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index ce1b61519441..70a63d7c1d02 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -179,7 +179,7 @@ int nvme_reset_ctrl(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_reset_ctrl); -int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) +static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) { int ret; @@ -192,7 +192,6 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) return ret; } -EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync); static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl) { @@ -578,7 +577,7 @@ struct request *nvme_alloc_request(struct request_queue *q, } EXPORT_SYMBOL_GPL(nvme_alloc_request); -struct request *nvme_alloc_request_qid(struct request_queue *q, +static struct request *nvme_alloc_request_qid(struct request_queue *q, struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid) { struct request *req; @@ -589,7 +588,6 @@ struct request *nvme_alloc_request_qid(struct request_queue *q, nvme_init_request(req, cmd); return req; } -EXPORT_SYMBOL_GPL(nvme_alloc_request_qid); static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable) { diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 7e49f61f81df..9c4fbfe44c00 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -610,8 +610,6 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl); #define NVME_QID_ANY -1 struct request *nvme_alloc_request(struct request_queue *q, struct nvme_command *cmd, blk_mq_req_flags_t flags); -struct request *nvme_alloc_request_qid(struct request_queue *q, - struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid); void nvme_cleanup_cmd(struct request *req); blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, struct nvme_command *cmd); @@ -630,7 +628,6 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid, int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); int nvme_reset_ctrl(struct nvme_ctrl *ctrl); -int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl); int nvme_try_sched_reset(struct nvme_ctrl *ctrl); int nvme_delete_ctrl(struct nvme_ctrl *ctrl); From 9ceb7863537748c67fa43ac4f2f565819bbd36e4 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Tue, 5 Jan 2021 10:46:54 +0200 Subject: [PATCH 397/547] nvmet-rdma: Fix list_del corruption on queue establishment failure When a queue is in NVMET_RDMA_Q_CONNECTING state, it may has some requests at rsp_wait_list. In case a disconnect occurs at this state, no one will empty this list and will return the requests to free_rsps list. Normally nvmet_rdma_queue_established() free those requests after moving the queue to NVMET_RDMA_Q_LIVE state, but in this case __nvmet_rdma_queue_disconnect() is called before. The crash happens at nvmet_rdma_free_rsps() when calling list_del(&rsp->free_list), because the request exists only at the wait list. To fix the issue, simply clear rsp_wait_list when destroying the queue. Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/target/rdma.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 5c1e7cb7fe0d..bdfc22eb2a10 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -1641,6 +1641,16 @@ static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue) spin_lock_irqsave(&queue->state_lock, flags); switch (queue->state) { case NVMET_RDMA_Q_CONNECTING: + while (!list_empty(&queue->rsp_wait_list)) { + struct nvmet_rdma_rsp *rsp; + + rsp = list_first_entry(&queue->rsp_wait_list, + struct nvmet_rdma_rsp, + wait_list); + list_del(&rsp->wait_list); + nvmet_rdma_put_rsp(rsp); + } + fallthrough; case NVMET_RDMA_Q_LIVE: queue->state = NVMET_RDMA_Q_DISCONNECTING; disconnect = true; From 2b59787a223b79228fed9ade1bf6936194ddb8cd Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 5 Jan 2021 10:34:02 +0000 Subject: [PATCH 398/547] nvme: remove the unused status argument from nvme_trace_bio_complete The only used argument in this function is the "req". Signed-off-by: Max Gurtovoy Reviewed-by: Minwoo Im Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- drivers/nvme/host/nvme.h | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 70a63d7c1d02..f320273fc672 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -330,7 +330,7 @@ static inline void nvme_end_req(struct request *req) req->__sector = nvme_lba_to_sect(req->q->queuedata, le64_to_cpu(nvme_req(req)->result.u64)); - nvme_trace_bio_complete(req, status); + nvme_trace_bio_complete(req); blk_mq_end_request(req, status); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 9c4fbfe44c00..88a6b97247f5 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -672,8 +672,7 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) kblockd_schedule_work(&head->requeue_work); } -static inline void nvme_trace_bio_complete(struct request *req, - blk_status_t status) +static inline void nvme_trace_bio_complete(struct request *req) { struct nvme_ns *ns = req->q->queuedata; @@ -728,8 +727,7 @@ static inline void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) { } -static inline void nvme_trace_bio_complete(struct request *req, - blk_status_t status) +static inline void nvme_trace_bio_complete(struct request *req) { } static inline int nvme_mpath_init(struct nvme_ctrl *ctrl, From 3ce47d95b7346dcafd9bed3556a8d072cb2b8571 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 4 Jan 2021 13:59:53 -0700 Subject: [PATCH 399/547] powerpc: Handle .text.{hot,unlikely}.* in linker script Commit eff8728fe698 ("vmlinux.lds.h: Add PGO and AutoFDO input sections") added ".text.unlikely.*" and ".text.hot.*" due to an LLVM change [1]. After another LLVM change [2], these sections are seen in some PowerPC builds, where there is a orphan section warning then build failure: $ make -skj"$(nproc)" \ ARCH=powerpc CROSS_COMPILE=powerpc64le-linux-gnu- LLVM=1 O=out \ distclean powernv_defconfig zImage.epapr ld.lld: warning: kernel/built-in.a(panic.o):(.text.unlikely.) is being placed in '.text.unlikely.' ... ld.lld: warning: address (0xc000000000009314) of section .text is not a multiple of alignment (256) ... ERROR: start_text address is c000000000009400, should be c000000000008000 ERROR: try to enable LD_HEAD_STUB_CATCH config option ERROR: see comments in arch/powerpc/tools/head_check.sh ... Explicitly handle these sections like in the main linker script so there is no more build failure. [1]: https://reviews.llvm.org/D79600 [2]: https://reviews.llvm.org/D92493 Fixes: 83a092cf95f2 ("powerpc: Link warning for orphan sections") Cc: stable@vger.kernel.org Signed-off-by: Nathan Chancellor Signed-off-by: Michael Ellerman Link: https://github.com/ClangBuiltLinux/linux/issues/1218 Link: https://lore.kernel.org/r/20210104205952.1399409-1-natechancellor@gmail.com --- arch/powerpc/kernel/vmlinux.lds.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 0318ba436f34..8e0b1298bf19 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -85,7 +85,7 @@ SECTIONS ALIGN_FUNCTION(); #endif /* careful! __ftr_alt_* sections need to be close to .text */ - *(.text.hot TEXT_MAIN .text.fixup .text.unlikely .fixup __ftr_alt_* .ref.text); + *(.text.hot .text.hot.* TEXT_MAIN .text.fixup .text.unlikely .text.unlikely.* .fixup __ftr_alt_* .ref.text); #ifdef CONFIG_PPC64 *(.tramp.ftrace.text); #endif From cb7f4a8b1fb426a175d1708f05581939c61329d4 Mon Sep 17 00:00:00 2001 From: Ying-Tsun Huang Date: Tue, 15 Dec 2020 15:07:20 +0800 Subject: [PATCH 400/547] x86/mtrr: Correct the range check before performing MTRR type lookups In mtrr_type_lookup(), if the input memory address region is not in the MTRR, over 4GB, and not over the top of memory, a write-back attribute is returned. These condition checks are for ensuring the input memory address region is actually mapped to the physical memory. However, if the end address is just aligned with the top of memory, the condition check treats the address is over the top of memory, and write-back attribute is not returned. And this hits in a real use case with NVDIMM: the nd_pmem module tries to map NVDIMMs as cacheable memories when NVDIMMs are connected. If a NVDIMM is the last of the DIMMs, the performance of this NVDIMM becomes very low since it is aligned with the top of memory and its memory type is uncached-minus. Move the input end address change to inclusive up into mtrr_type_lookup(), before checking for the top of memory in either mtrr_type_lookup_{variable,fixed}() helpers. [ bp: Massage commit message. ] Fixes: 0cc705f56e40 ("x86/mm/mtrr: Clean up mtrr_type_lookup()") Signed-off-by: Ying-Tsun Huang Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20201215070721.4349-1-ying-tsun.huang@amd.com --- arch/x86/kernel/cpu/mtrr/generic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 23ad8e953dfb..a29997e6cf9e 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -167,9 +167,6 @@ static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end, *repeat = 0; *uniform = 1; - /* Make end inclusive instead of exclusive */ - end--; - prev_match = MTRR_TYPE_INVALID; for (i = 0; i < num_var_ranges; ++i) { unsigned short start_state, end_state, inclusive; @@ -261,6 +258,9 @@ u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform) int repeat; u64 partial_end; + /* Make end inclusive instead of exclusive */ + end--; + if (!mtrr_state_set) return MTRR_TYPE_INVALID; From 3e2224c5867fead6c0b94b84727cc676ac6353a3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 6 Jan 2021 16:09:26 +0000 Subject: [PATCH 401/547] io_uring: Fix return value from alloc_fixed_file_ref_node alloc_fixed_file_ref_node() currently returns an ERR_PTR on failure. io_sqe_files_unregister() expects it to return NULL and since it can only return -ENOMEM, it makes more sense to change alloc_fixed_file_ref_node() to behave that way. Fixes: 1ffc54220c44 ("io_uring: fix io_sqe_files_unregister() hangs") Reported-by: Dan Carpenter Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index dc92ca5090a3..27a8c226abf8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7696,12 +7696,12 @@ static struct fixed_file_ref_node *alloc_fixed_file_ref_node( ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); if (!ref_node) - return ERR_PTR(-ENOMEM); + return NULL; if (percpu_ref_init(&ref_node->refs, io_file_data_ref_zero, 0, GFP_KERNEL)) { kfree(ref_node); - return ERR_PTR(-ENOMEM); + return NULL; } INIT_LIST_HEAD(&ref_node->node); INIT_LIST_HEAD(&ref_node->file_list); @@ -7795,9 +7795,9 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, } ref_node = alloc_fixed_file_ref_node(ctx); - if (IS_ERR(ref_node)) { + if (!ref_node) { io_sqe_files_unregister(ctx); - return PTR_ERR(ref_node); + return -ENOMEM; } io_sqe_files_set_node(file_data, ref_node); @@ -7897,8 +7897,8 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, return -EINVAL; ref_node = alloc_fixed_file_ref_node(ctx); - if (IS_ERR(ref_node)) - return PTR_ERR(ref_node); + if (!ref_node) + return -ENOMEM; done = 0; fds = u64_to_user_ptr(up->fds); From 00b8c557d096f0930d5c07df618223d3d06902d6 Mon Sep 17 00:00:00 2001 From: Matthias Maennich Date: Wed, 6 Jan 2021 15:52:01 +0000 Subject: [PATCH 402/547] staging: ION: remove some references to CONFIG_ION With commit e722a295cf49 ("staging: ion: remove from the tree"), ION and its corresponding config CONFIG_ION is gone. Remove stale references from drivers/staging/media/atomisp/pci and from the recommended Android kernel config. Fixes: e722a295cf49 ("staging: ion: remove from the tree") Cc: Hridya Valsaraju Cc: Rob Herring Cc: linux-media@vger.kernel.org Cc: devel@driverdev.osuosl.org Signed-off-by: Matthias Maennich Link: https://lore.kernel.org/r/20210106155201.2845319-1-maennich@google.com Signed-off-by: Greg Kroah-Hartman --- .../media/atomisp/pci/atomisp_subdev.c | 20 ------------------- kernel/configs/android-recommended.config | 1 - 2 files changed, 21 deletions(-) diff --git a/drivers/staging/media/atomisp/pci/atomisp_subdev.c b/drivers/staging/media/atomisp/pci/atomisp_subdev.c index 52b9fb18c87f..b666cb23e5ca 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_subdev.c +++ b/drivers/staging/media/atomisp/pci/atomisp_subdev.c @@ -1062,26 +1062,6 @@ static const struct v4l2_ctrl_config ctrl_select_isp_version = { .def = 0, }; -#if 0 /* #ifdef CONFIG_ION */ -/* - * Control for ISP ion device fd - * - * userspace will open ion device and pass the fd to kernel. - * this fd will be used to map shared fd to buffer. - */ -/* V4L2_CID_ATOMISP_ION_DEVICE_FD is not defined */ -static const struct v4l2_ctrl_config ctrl_ion_dev_fd = { - .ops = &ctrl_ops, - .id = V4L2_CID_ATOMISP_ION_DEVICE_FD, - .type = V4L2_CTRL_TYPE_INTEGER, - .name = "Ion Device Fd", - .min = -1, - .max = 1024, - .step = 1, - .def = ION_FD_UNSET -}; -#endif - static void atomisp_init_subdev_pipe(struct atomisp_sub_device *asd, struct atomisp_video_pipe *pipe, enum v4l2_buf_type buf_type) { diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config index 53d688bdd894..eb0029c9a6a6 100644 --- a/kernel/configs/android-recommended.config +++ b/kernel/configs/android-recommended.config @@ -81,7 +81,6 @@ CONFIG_INPUT_JOYSTICK=y CONFIG_INPUT_MISC=y CONFIG_INPUT_TABLET=y CONFIG_INPUT_UINPUT=y -CONFIG_ION=y CONFIG_JOYSTICK_XPAD=y CONFIG_JOYSTICK_XPAD_FF=y CONFIG_JOYSTICK_XPAD_LEDS=y From 0e61f09af48beb41be0954e7be7d3ba2d18c9946 Mon Sep 17 00:00:00 2001 From: Xiaojian Du Date: Mon, 14 Dec 2020 17:05:55 +0800 Subject: [PATCH 403/547] drm/amd/pm: correct the sensor value of power for vangogh This patch is to correct the sensor value of power for vangogh. Signed-off-by: Xiaojian Du Reviewed-by: Evan Quan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c index 8cb4fcee9a2c..5c1482d4ca43 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c @@ -252,7 +252,8 @@ static int vangogh_get_smu_metrics_data(struct smu_context *smu, *value = metrics->UvdActivity; break; case METRICS_AVERAGE_SOCKETPOWER: - *value = metrics->CurrentSocketPower; + *value = (metrics->CurrentSocketPower << 8) / + 1000 ; break; case METRICS_TEMPERATURE_EDGE: *value = metrics->GfxTemperature / 100 * From 37030aba0f362cf8b16eb2347c7430b2e9ef719e Mon Sep 17 00:00:00 2001 From: Xiaojian Du Date: Fri, 18 Dec 2020 14:32:02 +0800 Subject: [PATCH 404/547] drm/amd/pm: improve the fine grain tuning function for RV/RV2/PCO This patch is to improve the fine grain tuning function for RV/RV2/PCO. This patch adds two new commands: "restore" and "commit". This function uses the pp_od_clk_voltage sysfs file to configure the min and max value of gfx clock frequency manually or restore the default value. Command guide: echo "s level value" > pp_od_clk_voltage "s" - set the sclk frequency "level" - 0 or 1, "0" represents the min value, "1" represents the max value "value" - the target value of sclk frequency, it should be limited in the safe range echo "r" > pp_od_clk_voltage "r" - reset the sclk frequency, restore the default value instantly echo "c" > pp_od_clk_voltage "c" - commit the min and max value of sclk frequency to the system only after the commit command, the target values set by "s" command will take effect. Example: 1)change power profile from "auto" to "manual" $ cat power_dpm_force_performance_level auto $ echo "manual" > power_dpm_force_performance_level $ cat power_dpm_force_performance_level manual 2)check the default sclk frequency $ cat pp_od_clk_voltage OD_SCLK: 0: 200Mhz 1: 1400Mhz OD_RANGE: SCLK: 200MHz 1400MHz 3)use "s" -- set command to configure the min and max sclk frequency $ echo "s 0 600" > pp_od_clk_voltage $ echo "s 1 1000" > pp_od_clk_voltage $ echo "c" > pp_od_clk_voltage $ cat pp_od_clk_voltage OD_SCLK: 0: 600Mhz 1: 1000Mhz OD_RANGE: SCLK: 200MHz 1400MHz 4)use "r" -- reset command to restore the min or max sclk frequency $ echo "r" > pp_od_clk_voltage $ cat pp_od_clk_voltage OD_SCLK: 0: 200Mhz 1: 1400Mhz OD_RANGE: SCLK: 200MHz 1400MHz Signed-off-by: Xiaojian Du Reviewed-by: Evan Quan Signed-off-by: Alex Deucher --- .../drm/amd/pm/powerplay/hwmgr/smu10_hwmgr.c | 114 +++++++++++++++--- .../drm/amd/pm/powerplay/hwmgr/smu10_hwmgr.h | 1 + 2 files changed, 98 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu10_hwmgr.c b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu10_hwmgr.c index e57e64bbacdc..0cf899566e31 100644 --- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu10_hwmgr.c +++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu10_hwmgr.c @@ -251,7 +251,7 @@ static int smu10_set_hard_min_gfxclk_by_freq(struct pp_hwmgr *hwmgr, uint32_t cl smu10_data->gfx_actual_soft_min_freq = clock; smum_send_msg_to_smc_with_parameter(hwmgr, PPSMC_MSG_SetHardMinGfxClk, - smu10_data->gfx_actual_soft_min_freq, + clock, NULL); } return 0; @@ -948,6 +948,8 @@ static int smu10_print_clock_levels(struct pp_hwmgr *hwmgr, struct smu10_voltage_dependency_table *mclk_table = data->clock_vol_info.vdd_dep_on_fclk; uint32_t i, now, size = 0; + uint32_t min_freq, max_freq = 0; + uint32_t ret = 0; switch (type) { case PP_SCLK: @@ -983,18 +985,28 @@ static int smu10_print_clock_levels(struct pp_hwmgr *hwmgr, break; case OD_SCLK: if (hwmgr->od_enabled) { - size = sprintf(buf, "%s:\n", "OD_SCLK"); + ret = smum_send_msg_to_smc(hwmgr, PPSMC_MSG_GetMinGfxclkFrequency, &min_freq); + if (ret) + return ret; + ret = smum_send_msg_to_smc(hwmgr, PPSMC_MSG_GetMaxGfxclkFrequency, &max_freq); + if (ret) + return ret; + size = sprintf(buf, "%s:\n", "OD_SCLK"); size += sprintf(buf + size, "0: %10uMhz\n", - (data->gfx_actual_soft_min_freq > 0) ? data->gfx_actual_soft_min_freq : data->gfx_min_freq_limit/100); - size += sprintf(buf + size, "1: %10uMhz\n", data->gfx_max_freq_limit/100); + (data->gfx_actual_soft_min_freq > 0) ? data->gfx_actual_soft_min_freq : min_freq); + size += sprintf(buf + size, "1: %10uMhz\n", + (data->gfx_actual_soft_max_freq > 0) ? data->gfx_actual_soft_max_freq : max_freq); } break; case OD_RANGE: if (hwmgr->od_enabled) { - uint32_t min_freq, max_freq = 0; - smum_send_msg_to_smc(hwmgr, PPSMC_MSG_GetMinGfxclkFrequency, &min_freq); - smum_send_msg_to_smc(hwmgr, PPSMC_MSG_GetMaxGfxclkFrequency, &max_freq); + ret = smum_send_msg_to_smc(hwmgr, PPSMC_MSG_GetMinGfxclkFrequency, &min_freq); + if (ret) + return ret; + ret = smum_send_msg_to_smc(hwmgr, PPSMC_MSG_GetMaxGfxclkFrequency, &max_freq); + if (ret) + return ret; size = sprintf(buf, "%s:\n", "OD_RANGE"); size += sprintf(buf + size, "SCLK: %7uMHz %10uMHz\n", @@ -1414,23 +1426,91 @@ static int smu10_set_fine_grain_clk_vol(struct pp_hwmgr *hwmgr, enum PP_OD_DPM_TABLE_COMMAND type, long *input, uint32_t size) { + uint32_t min_freq, max_freq = 0; + struct smu10_hwmgr *smu10_data = (struct smu10_hwmgr *)(hwmgr->backend); + int ret = 0; + if (!hwmgr->od_enabled) { pr_err("Fine grain not support\n"); return -EINVAL; } - if (size != 2) { - pr_err("Input parameter number not correct\n"); - return -EINVAL; - } - if (type == PP_OD_EDIT_SCLK_VDDC_TABLE) { - if (input[0] == 0) - smu10_set_hard_min_gfxclk_by_freq(hwmgr, input[1]); - else if (input[0] == 1) - smu10_set_soft_max_gfxclk_by_freq(hwmgr, input[1]); - else + if (size != 2) { + pr_err("Input parameter number not correct\n"); return -EINVAL; + } + + if (input[0] == 0) { + smum_send_msg_to_smc(hwmgr, PPSMC_MSG_GetMinGfxclkFrequency, &min_freq); + if (input[1] < min_freq) { + pr_err("Fine grain setting minimum sclk (%ld) MHz is less than the minimum allowed (%d) MHz\n", + input[1], min_freq); + return -EINVAL; + } + smu10_data->gfx_actual_soft_min_freq = input[1]; + } else if (input[0] == 1) { + smum_send_msg_to_smc(hwmgr, PPSMC_MSG_GetMaxGfxclkFrequency, &max_freq); + if (input[1] > max_freq) { + pr_err("Fine grain setting maximum sclk (%ld) MHz is greater than the maximum allowed (%d) MHz\n", + input[1], max_freq); + return -EINVAL; + } + smu10_data->gfx_actual_soft_max_freq = input[1]; + } else { + return -EINVAL; + } + } else if (type == PP_OD_RESTORE_DEFAULT_TABLE) { + if (size != 0) { + pr_err("Input parameter number not correct\n"); + return -EINVAL; + } + smum_send_msg_to_smc(hwmgr, PPSMC_MSG_GetMinGfxclkFrequency, &min_freq); + smum_send_msg_to_smc(hwmgr, PPSMC_MSG_GetMaxGfxclkFrequency, &max_freq); + + smu10_data->gfx_actual_soft_min_freq = min_freq; + smu10_data->gfx_actual_soft_max_freq = max_freq; + + ret = smum_send_msg_to_smc_with_parameter(hwmgr, + PPSMC_MSG_SetHardMinGfxClk, + min_freq, + NULL); + if (ret) + return ret; + + ret = smum_send_msg_to_smc_with_parameter(hwmgr, + PPSMC_MSG_SetSoftMaxGfxClk, + max_freq, + NULL); + if (ret) + return ret; + } else if (type == PP_OD_COMMIT_DPM_TABLE) { + if (size != 0) { + pr_err("Input parameter number not correct\n"); + return -EINVAL; + } + + if (smu10_data->gfx_actual_soft_min_freq > smu10_data->gfx_actual_soft_max_freq) { + pr_err("The setting minimun sclk (%d) MHz is greater than the setting maximum sclk (%d) MHz\n", + smu10_data->gfx_actual_soft_min_freq, smu10_data->gfx_actual_soft_max_freq); + return -EINVAL; + } + + ret = smum_send_msg_to_smc_with_parameter(hwmgr, + PPSMC_MSG_SetHardMinGfxClk, + smu10_data->gfx_actual_soft_min_freq, + NULL); + if (ret) + return ret; + + ret = smum_send_msg_to_smc_with_parameter(hwmgr, + PPSMC_MSG_SetSoftMaxGfxClk, + smu10_data->gfx_actual_soft_max_freq, + NULL); + if (ret) + return ret; + } else { + return -EINVAL; } return 0; diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu10_hwmgr.h b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu10_hwmgr.h index 6c9b5f060902..28d86d354d50 100644 --- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu10_hwmgr.h +++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu10_hwmgr.h @@ -283,6 +283,7 @@ struct smu10_hwmgr { uint32_t vclk_soft_min; uint32_t dclk_soft_min; uint32_t gfx_actual_soft_min_freq; + uint32_t gfx_actual_soft_max_freq; uint32_t gfx_min_freq_limit; uint32_t gfx_max_freq_limit; /* in 10Khz*/ From fc996f952df1c63b57e3a08ac612db53bf8abadc Mon Sep 17 00:00:00 2001 From: John Clements Date: Fri, 25 Dec 2020 12:22:51 +0800 Subject: [PATCH 405/547] drm/amd/pm: updated PM to I2C controller port on sienna cichlid sienna cichlid interfaces with RAS eeprom on I2C controller port 1 Reviewed-by: Hawking Zhang Signed-off-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c index 9608745d732f..12b36eb0ff6a 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c @@ -2372,7 +2372,7 @@ static void sienna_cichlid_fill_i2c_req(SwI2cRequest_t *req, bool write, { int i; - req->I2CcontrollerPort = 0; + req->I2CcontrollerPort = 1; req->I2CSpeed = 2; req->SlaveAddress = address; req->NumCmds = numbytes; From a7b5d9dd57298333e6e9f4c167f01385d922bbfb Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Tue, 29 Dec 2020 14:10:28 +0800 Subject: [PATCH 406/547] drm/amd/display: fix sysfs amdgpu_current_backlight_pwm NULL pointer issue fix NULL pointer issue when read sysfs amdgpu_current_backlight_pwm sysfs node. Call Trace: [ 248.273833] BUG: kernel NULL pointer dereference, address: 0000000000000130 [ 248.273930] #PF: supervisor read access in kernel mode [ 248.273993] #PF: error_code(0x0000) - not-present page [ 248.274054] PGD 0 P4D 0 [ 248.274092] Oops: 0000 [#1] SMP PTI [ 248.274138] CPU: 2 PID: 1377 Comm: cat Tainted: G OE 5.9.0-rc5-drm-next-5.9+ #1 [ 248.274233] Hardware name: System manufacturer System Product Name/Z170-A, BIOS 3802 03/15/2018 [ 248.274641] RIP: 0010:dc_link_get_backlight_level+0x5/0x70 [amdgpu] [ 248.274718] Code: 67 ff ff ff 41 b9 03 00 00 00 e9 45 ff ff ff d1 ea e9 55 ff ff ff 0f 1f 44 00 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 <48> 8b 87 30 01 00 00 48 8b 00 48 8b 88 88 03 00 00 48 8d 81 e8 01 [ 248.274919] RSP: 0018:ffffb5ad809b3df0 EFLAGS: 00010203 [ 248.274982] RAX: ffffa0f77d1c0010 RBX: ffffa0f793ae9168 RCX: 0000000000000001 [ 248.275064] RDX: ffffa0f79753db00 RSI: 0000000000000001 RDI: 0000000000000000 [ 248.275145] RBP: ffffb5ad809b3e00 R08: ffffb5ad809b3da0 R09: 0000000000000000 [ 248.275225] R10: ffffb5ad809b3e68 R11: 0000000000000000 R12: ffffa0f793ae9190 [ 248.275306] R13: ffffb5ad809b3ef0 R14: 0000000000000001 R15: ffffa0f793ae9168 [ 248.275388] FS: 00007f5f1ec4d540(0000) GS:ffffa0f79ec80000(0000) knlGS:0000000000000000 [ 248.275480] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 248.275547] CR2: 0000000000000130 CR3: 000000042a03c005 CR4: 00000000003706e0 [ 248.275628] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 248.275708] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 248.275789] Call Trace: [ 248.276124] ? current_backlight_read+0x24/0x40 [amdgpu] [ 248.276194] seq_read+0xc3/0x3f0 [ 248.276240] full_proxy_read+0x5c/0x90 [ 248.276290] vfs_read+0xa7/0x190 [ 248.276334] ksys_read+0xa7/0xe0 [ 248.276379] __x64_sys_read+0x1a/0x20 [ 248.276429] do_syscall_64+0x37/0x80 [ 248.276477] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 248.276538] RIP: 0033:0x7f5f1e75c191 [ 248.276585] Code: fe ff ff 48 8d 3d b7 9d 0a 00 48 83 ec 08 e8 46 4d 02 00 66 0f 1f 44 00 00 48 8d 05 71 07 2e 00 8b 00 85 c0 75 13 31 c0 0f 05 <48> 3d 00 f0 ff ff 77 57 f3 c3 0f 1f 44 00 00 41 54 55 49 89 d4 53Hw [ 248.276784] RSP: 002b:00007ffcb1fc3f38 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [ 248.276872] RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007f5f1e75c191 [ 248.276953] RDX: 0000000000020000 RSI: 00007f5f1ec2b000 RDI: 0000000000000003 [ 248.277034] RBP: 0000000000020000 R08: 00000000ffffffff R09: 0000000000000000 [ 248.277115] R10: 0000000000000022 R11: 0000000000000246 R12: 00007f5f1ec2b000 [ 248.277195] R13: 0000000000000003 R14: 00007f5f1ec2b00f R15: 0000000000020000 [ 248.277279] Modules linked in: amdgpu(OE) iommu_v2 gpu_sched ttm(OE) drm_kms_helper cec drm i2c_algo_bit fb_sys_fops syscopyarea sysfillrect sysimgblt rpcsec_gss_krb5 auth_rpcgss nfsv4 nfs lockd grace fscache nls_iso8859_1 snd_hda_codec_realtek snd_hda_codec_hdmi snd_hda_codec_generic ledtrig_audio intel_rapl_msr intel_rapl_common snd_hda_intel snd_intel_dspcfg x86_pkg_temp_thermal intel_powerclamp snd_hda_codec snd_hda_core snd_hwdep snd_pcm snd_seq_midi snd_seq_midi_event mei_hdcp coretemp snd_rawmidi snd_seq kvm_intel kvm snd_seq_device snd_timer irqbypass joydev snd input_leds soundcore crct10dif_pclmul crc32_pclmul ghash_clmulni_intel aesni_intel crypto_simd cryptd glue_helper rapl intel_cstate mac_hid mei_me serio_raw mei eeepc_wmi wmi_bmof asus_wmi mxm_wmi intel_wmi_thunderbolt acpi_pad sparse_keymap efi_pstore sch_fq_codel parport_pc ppdev lp parport sunrpc ip_tables x_tables autofs4 hid_logitech_hidpp hid_logitech_dj hid_generic usbhid hid e1000e psmouse ahci libahci wmi video [ 248.278211] CR2: 0000000000000130 [ 248.278221] ---[ end trace 1fbe72fe6f91091d ]--- [ 248.357226] RIP: 0010:dc_link_get_backlight_level+0x5/0x70 [amdgpu] [ 248.357272] Code: 67 ff ff ff 41 b9 03 00 00 00 e9 45 ff ff ff d1 ea e9 55 ff ff ff 0f 1f 44 00 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 <48> 8b 87 30 01 00 00 48 8b 00 48 8b 88 88 03 00 00 48 8d 81 e8 01 Signed-off-by: Kevin Wang Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/core/dc_link.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link.c b/drivers/gpu/drm/amd/display/dc/core/dc_link.c index 9e1071b2181f..f4a2088ab179 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_link.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_link.c @@ -2487,9 +2487,14 @@ enum dc_status dc_link_validate_mode_timing( static struct abm *get_abm_from_stream_res(const struct dc_link *link) { int i; - struct dc *dc = link->ctx->dc; + struct dc *dc = NULL; struct abm *abm = NULL; + if (!link || !link->ctx) + return NULL; + + dc = link->ctx->dc; + for (i = 0; i < MAX_PIPES; i++) { struct pipe_ctx pipe_ctx = dc->current_state->res_ctx.pipe_ctx[i]; struct dc_stream_state *stream = pipe_ctx.stream; From ed1df58585632dff96cc01e14857175dfdf67376 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Thu, 31 Dec 2020 13:05:09 +0800 Subject: [PATCH 407/547] drm/amdgpu: switched to cached noretry setting for vangogh global noretry setting is cached to gmc.noretry Signed-off-by: Hawking Zhang Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/mmhub_v2_3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_3.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_3.c index b72c8e4ca36b..07104a1de308 100644 --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_3.c +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_3.c @@ -310,7 +310,7 @@ static void mmhub_v2_3_setup_vmid_config(struct amdgpu_device *adev) /* Send no-retry XNACK on fault to suppress VM fault storm. */ tmp = REG_SET_FIELD(tmp, MMVM_CONTEXT1_CNTL, RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, - !amdgpu_noretry); + !adev->gmc.noretry); WREG32_SOC15_OFFSET(MMHUB, 0, mmMMVM_CONTEXT1_CNTL, i * hub->ctx_distance, tmp); WREG32_SOC15_OFFSET(MMHUB, 0, mmMMVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, From 9a029a3facc4d333100308a8e283d9210a36b94c Mon Sep 17 00:00:00 2001 From: Dennis Li Date: Wed, 30 Dec 2020 10:27:42 +0800 Subject: [PATCH 408/547] drm/amdgpu: fix a memory protection fault when remove amdgpu device ASD and TA share the same firmware in SIENNA_CICHLID and only TA firmware is requested during boot, so only need release TA firmware when remove device. [ 83.877150] general protection fault, probably for non-canonical address 0x1269f97e6ed04095: 0000 [#1] SMP PTI [ 83.888076] CPU: 0 PID: 1312 Comm: modprobe Tainted: G W OE 5.9.0-rc5-deli-amd-vangogh-0.0.6.6-114-gdd99d5669a96-dirty #2 [ 83.901160] Hardware name: System manufacturer System Product Name/TUF Z370-PLUS GAMING II, BIOS 0411 09/21/2018 [ 83.912353] RIP: 0010:free_fw_priv+0xc/0x120 [ 83.917531] Code: e8 99 cd b0 ff b8 a1 ff ff ff eb 9f 4c 89 f7 e8 8a cd b0 ff b8 f4 ff ff ff eb 90 0f 1f 00 0f 1f 44 00 00 55 48 89 e5 41 54 53 <4c> 8b 67 18 48 89 fb 4c 89 e7 e8 45 94 41 00 b8 ff ff ff ff f0 0f [ 83.937576] RSP: 0018:ffffbc34c13a3ce0 EFLAGS: 00010206 [ 83.943699] RAX: ffffffffbb681850 RBX: ffffa047f117eb60 RCX: 0000000080800055 [ 83.951879] RDX: ffffbc34c1d5f000 RSI: 0000000080800055 RDI: 1269f97e6ed04095 [ 83.959955] RBP: ffffbc34c13a3cf0 R08: 0000000000000000 R09: 0000000000000001 [ 83.968107] R10: ffffbc34c13a3cc8 R11: 00000000ffffff00 R12: ffffa047d6b23378 [ 83.976166] R13: ffffa047d6b23338 R14: ffffa047d6b240c8 R15: 0000000000000000 [ 83.984295] FS: 00007f74f6712540(0000) GS:ffffa047fbe00000(0000) knlGS:0000000000000000 [ 83.993323] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 84.000056] CR2: 0000556a1cca4e18 CR3: 000000021faa8004 CR4: 00000000003706f0 [ 84.008128] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 84.016155] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 84.024174] Call Trace: [ 84.027514] release_firmware.part.11+0x4b/0x70 [ 84.033017] release_firmware+0x13/0x20 [ 84.037803] psp_sw_fini+0x77/0xb0 [amdgpu] [ 84.042857] amdgpu_device_fini+0x38c/0x5d0 [amdgpu] [ 84.048815] amdgpu_driver_unload_kms+0x43/0x70 [amdgpu] [ 84.055055] drm_dev_unregister+0x73/0xb0 [drm] [ 84.060499] drm_dev_unplug+0x28/0x30 [drm] [ 84.065598] amdgpu_dev_uninit+0x1b/0x40 [amdgpu] [ 84.071223] amdgpu_pci_remove+0x4e/0x70 [amdgpu] [ 84.076835] pci_device_remove+0x3e/0xc0 [ 84.081609] device_release_driver_internal+0xfb/0x1c0 [ 84.087558] driver_detach+0x4d/0xa0 [ 84.092041] bus_remove_driver+0x5f/0xe0 [ 84.096854] driver_unregister+0x2f/0x50 [ 84.101594] pci_unregister_driver+0x22/0xa0 [ 84.106806] amdgpu_exit+0x15/0x2b [amdgpu] Signed-off-by: Dennis Li Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 523d22db094b..5d6fc369e32c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -563,7 +563,7 @@ static int psp_asd_load(struct psp_context *psp) * add workaround to bypass it for sriov now. * TODO: add version check to make it common */ - if (amdgpu_sriov_vf(psp->adev) || !psp->asd_fw) + if (amdgpu_sriov_vf(psp->adev) || !psp->asd_ucode_size) return 0; cmd = kzalloc(sizeof(struct psp_gfx_cmd_resp), GFP_KERNEL); @@ -2589,11 +2589,10 @@ static int parse_ta_bin_descriptor(struct psp_context *psp, switch (desc->fw_type) { case TA_FW_TYPE_PSP_ASD: - psp->asd_fw_version = le32_to_cpu(desc->fw_version); + psp->asd_fw_version = le32_to_cpu(desc->fw_version); psp->asd_feature_version = le32_to_cpu(desc->fw_version); - psp->asd_ucode_size = le32_to_cpu(desc->size_bytes); + psp->asd_ucode_size = le32_to_cpu(desc->size_bytes); psp->asd_start_addr = ucode_start_addr; - psp->asd_fw = psp->ta_fw; break; case TA_FW_TYPE_PSP_XGMI: psp->ta_xgmi_ucode_version = le32_to_cpu(desc->fw_version); From 88e21af1b3f887d217f2fb14fc7e7d3cd87ebf57 Mon Sep 17 00:00:00 2001 From: Dennis Li Date: Wed, 30 Dec 2020 19:45:15 +0800 Subject: [PATCH 409/547] drm/amdgpu: fix a GPU hang issue when remove device When GFXOFF is enabled and GPU is idle, driver will fail to access some registers. Therefore change to disable power gating before all access registers with MMIO. Dmesg log is as following: amdgpu 0000:03:00.0: amdgpu: amdgpu: finishing device. amdgpu: cp queue pipe 4 queue 0 preemption failed amdgpu 0000:03:00.0: amdgpu: failed to write reg 2890 wait reg 28a2 amdgpu 0000:03:00.0: amdgpu: failed to write reg 1a6f4 wait reg 1a706 amdgpu 0000:03:00.0: amdgpu: failed to write reg 2890 wait reg 28a2 amdgpu 0000:03:00.0: amdgpu: failed to write reg 1a6f4 wait reg 1a706 Signed-off-by: Dennis Li Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 1cb7d73f7317..b69c34074d8d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2548,11 +2548,11 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev) if (adev->gmc.xgmi.num_physical_nodes > 1) amdgpu_xgmi_remove_device(adev); - amdgpu_amdkfd_device_fini(adev); - amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); + amdgpu_amdkfd_device_fini(adev); + /* need to disable SMC first */ for (i = 0; i < adev->num_ip_blocks; i++) { if (!adev->ip_blocks[i].status.hw) From 44cb39e19a05ca711bcb6e776e0a4399223204a0 Mon Sep 17 00:00:00 2001 From: Xiaojian Du Date: Wed, 30 Dec 2020 18:08:23 +0800 Subject: [PATCH 410/547] drm/amd/pm: fix the failure when change power profile for renoir This patch is to fix the failure when change power profile to "profile_peak" for renoir. Signed-off-by: Xiaojian Du Reviewed-by: Huang Rui Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c | 1 + drivers/gpu/drm/amd/pm/swsmu/smu12/smu_v12_0.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c index dc75db8af371..f743685a20e8 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c @@ -188,6 +188,7 @@ static int renoir_get_dpm_clk_limited(struct smu_context *smu, enum smu_clk_type return -EINVAL; *freq = clk_table->SocClocks[dpm_level].Freq; break; + case SMU_UCLK: case SMU_MCLK: if (dpm_level >= NUM_FCLK_DPM_LEVELS) return -EINVAL; diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu12/smu_v12_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu12/smu_v12_0.c index 522d55004655..06abf2a7ce9e 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu12/smu_v12_0.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu12/smu_v12_0.c @@ -225,6 +225,7 @@ int smu_v12_0_set_soft_freq_limited_range(struct smu_context *smu, enum smu_clk_ break; case SMU_FCLK: case SMU_MCLK: + case SMU_UCLK: ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_SetHardMinFclkByFreq, min, NULL); if (ret) return ret; From 98b64762080b96b0f8608da5fe161f1a7ab6f5de Mon Sep 17 00:00:00 2001 From: Xiaojian Du Date: Tue, 29 Dec 2020 17:19:37 +0800 Subject: [PATCH 411/547] drm/amd/pm: improve the fine grain tuning function for RV/RV2/PCO This patch is to improve the fine grain tuning function for RV/RV2/PCO. The fine grain tuning function uses the sysfs node -- pp_od_clk_voltage to config gfxclk. Meanwhile, another sysfs node -- power_dpm_force_perfomance_level also affects the gfx clk. It will cause confusion when these two sysfs nodes works together. So this patch adds one flag to avoid this confusion, the flag will make these two sysfs nodes work separately. The flag is set as "disabled" by default, so the fine grain tuning function will be disabled by default. Only when power_dpm_force_perfomance_level is changed to "manual" mode, the flag will be set as "enabled", and the fine grain tuning function will be enabled. In other profile modes, including "auto", "high", "low", "profile_peak", "profile_standard", "profile_min_sclk", "profile_min_mclk", the flag will be set as "disabled", and the od range of fine grain tuning function will be restored default value. Signed-off-by: Xiaojian Du Reviewed-by: Huang Rui Signed-off-by: Alex Deucher --- .../drm/amd/pm/powerplay/hwmgr/smu10_hwmgr.c | 58 ++++++++++++++++++- .../drm/amd/pm/powerplay/hwmgr/smu10_hwmgr.h | 2 + 2 files changed, 59 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu10_hwmgr.c b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu10_hwmgr.c index 0cf899566e31..88322781e447 100644 --- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu10_hwmgr.c +++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu10_hwmgr.c @@ -558,7 +558,8 @@ static int smu10_hwmgr_backend_init(struct pp_hwmgr *hwmgr) /* enable the pp_od_clk_voltage sysfs file */ hwmgr->od_enabled = 1; - + /* disabled fine grain tuning function by default */ + data->fine_grain_enabled = 0; return result; } @@ -597,6 +598,7 @@ static int smu10_dpm_force_dpm_level(struct pp_hwmgr *hwmgr, uint32_t min_mclk = hwmgr->display_config->min_mem_set_clock/100; uint32_t index_fclk = data->clock_vol_info.vdd_dep_on_fclk->count - 1; uint32_t index_socclk = data->clock_vol_info.vdd_dep_on_socclk->count - 1; + uint32_t fine_grain_min_freq = 0, fine_grain_max_freq = 0; if (hwmgr->smu_version < 0x1E3700) { pr_info("smu firmware version too old, can not set dpm level\n"); @@ -613,6 +615,14 @@ static int smu10_dpm_force_dpm_level(struct pp_hwmgr *hwmgr, switch (level) { case AMD_DPM_FORCED_LEVEL_HIGH: case AMD_DPM_FORCED_LEVEL_PROFILE_PEAK: + data->fine_grain_enabled = 0; + + smum_send_msg_to_smc(hwmgr, PPSMC_MSG_GetMinGfxclkFrequency, &fine_grain_min_freq); + smum_send_msg_to_smc(hwmgr, PPSMC_MSG_GetMaxGfxclkFrequency, &fine_grain_max_freq); + + data->gfx_actual_soft_min_freq = fine_grain_min_freq; + data->gfx_actual_soft_max_freq = fine_grain_max_freq; + smum_send_msg_to_smc_with_parameter(hwmgr, PPSMC_MSG_SetHardMinGfxClk, data->gfx_max_freq_limit/100, @@ -648,6 +658,14 @@ static int smu10_dpm_force_dpm_level(struct pp_hwmgr *hwmgr, NULL); break; case AMD_DPM_FORCED_LEVEL_PROFILE_MIN_SCLK: + data->fine_grain_enabled = 0; + + smum_send_msg_to_smc(hwmgr, PPSMC_MSG_GetMinGfxclkFrequency, &fine_grain_min_freq); + smum_send_msg_to_smc(hwmgr, PPSMC_MSG_GetMaxGfxclkFrequency, &fine_grain_max_freq); + + data->gfx_actual_soft_min_freq = fine_grain_min_freq; + data->gfx_actual_soft_max_freq = fine_grain_max_freq; + smum_send_msg_to_smc_with_parameter(hwmgr, PPSMC_MSG_SetHardMinGfxClk, min_sclk, @@ -658,6 +676,14 @@ static int smu10_dpm_force_dpm_level(struct pp_hwmgr *hwmgr, NULL); break; case AMD_DPM_FORCED_LEVEL_PROFILE_MIN_MCLK: + data->fine_grain_enabled = 0; + + smum_send_msg_to_smc(hwmgr, PPSMC_MSG_GetMinGfxclkFrequency, &fine_grain_min_freq); + smum_send_msg_to_smc(hwmgr, PPSMC_MSG_GetMaxGfxclkFrequency, &fine_grain_max_freq); + + data->gfx_actual_soft_min_freq = fine_grain_min_freq; + data->gfx_actual_soft_max_freq = fine_grain_max_freq; + smum_send_msg_to_smc_with_parameter(hwmgr, PPSMC_MSG_SetHardMinFclkByFreq, min_mclk, @@ -668,6 +694,14 @@ static int smu10_dpm_force_dpm_level(struct pp_hwmgr *hwmgr, NULL); break; case AMD_DPM_FORCED_LEVEL_PROFILE_STANDARD: + data->fine_grain_enabled = 0; + + smum_send_msg_to_smc(hwmgr, PPSMC_MSG_GetMinGfxclkFrequency, &fine_grain_min_freq); + smum_send_msg_to_smc(hwmgr, PPSMC_MSG_GetMaxGfxclkFrequency, &fine_grain_max_freq); + + data->gfx_actual_soft_min_freq = fine_grain_min_freq; + data->gfx_actual_soft_max_freq = fine_grain_max_freq; + smum_send_msg_to_smc_with_parameter(hwmgr, PPSMC_MSG_SetHardMinGfxClk, SMU10_UMD_PSTATE_GFXCLK, @@ -703,6 +737,14 @@ static int smu10_dpm_force_dpm_level(struct pp_hwmgr *hwmgr, NULL); break; case AMD_DPM_FORCED_LEVEL_AUTO: + data->fine_grain_enabled = 0; + + smum_send_msg_to_smc(hwmgr, PPSMC_MSG_GetMinGfxclkFrequency, &fine_grain_min_freq); + smum_send_msg_to_smc(hwmgr, PPSMC_MSG_GetMaxGfxclkFrequency, &fine_grain_max_freq); + + data->gfx_actual_soft_min_freq = fine_grain_min_freq; + data->gfx_actual_soft_max_freq = fine_grain_max_freq; + smum_send_msg_to_smc_with_parameter(hwmgr, PPSMC_MSG_SetHardMinGfxClk, min_sclk, @@ -741,6 +783,14 @@ static int smu10_dpm_force_dpm_level(struct pp_hwmgr *hwmgr, NULL); break; case AMD_DPM_FORCED_LEVEL_LOW: + data->fine_grain_enabled = 0; + + smum_send_msg_to_smc(hwmgr, PPSMC_MSG_GetMinGfxclkFrequency, &fine_grain_min_freq); + smum_send_msg_to_smc(hwmgr, PPSMC_MSG_GetMaxGfxclkFrequency, &fine_grain_max_freq); + + data->gfx_actual_soft_min_freq = fine_grain_min_freq; + data->gfx_actual_soft_max_freq = fine_grain_max_freq; + smum_send_msg_to_smc_with_parameter(hwmgr, PPSMC_MSG_SetHardMinGfxClk, data->gfx_min_freq_limit/100, @@ -759,6 +809,7 @@ static int smu10_dpm_force_dpm_level(struct pp_hwmgr *hwmgr, NULL); break; case AMD_DPM_FORCED_LEVEL_MANUAL: + data->fine_grain_enabled = 1; case AMD_DPM_FORCED_LEVEL_PROFILE_EXIT: default: break; @@ -1435,6 +1486,11 @@ static int smu10_set_fine_grain_clk_vol(struct pp_hwmgr *hwmgr, return -EINVAL; } + if (!smu10_data->fine_grain_enabled) { + pr_err("Fine grain not started\n"); + return -EINVAL; + } + if (type == PP_OD_EDIT_SCLK_VDDC_TABLE) { if (size != 2) { pr_err("Input parameter number not correct\n"); diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu10_hwmgr.h b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu10_hwmgr.h index 28d86d354d50..808e0ecbe1f0 100644 --- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu10_hwmgr.h +++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu10_hwmgr.h @@ -300,6 +300,8 @@ struct smu10_hwmgr { bool need_min_deep_sleep_dcefclk; uint32_t deep_sleep_dcefclk; uint32_t num_active_display; + + bool fine_grain_enabled; }; struct pp_hwmgr; From 4f6a05501eb9c57fb4c9efed70840aee523a393b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sun, 3 Jan 2021 15:02:32 +0100 Subject: [PATCH 412/547] drm/amd/display: Fix unused variable warning Some of the newly added code is hidden inside of #ifdef blocks, but one variable is unused when debugfs is disabled: drivers/gpu/drm/amd/amdgpu/../display/amdgpu_dm/amdgpu_dm.c:8370:8: error: unused variable 'configure_crc' [-Werror,-Wunused-variable] Change the #ifdef to an if(IS_ENABLED()) check to fix the warning and avoid adding more #ifdefs. Fixes: c920888c604d ("drm/amd/display: Expose new CRC window property") Reviewed-by: Wayne Lin Signed-off-by: Arnd Bergmann Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 4 +--- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crc.h | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 519080e9a233..3fb6baf9b0ba 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -8379,8 +8379,7 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state) acrtc->dm_irq_params.stream = dm_new_crtc_state->stream; manage_dm_interrupts(adev, acrtc, true); } -#ifdef CONFIG_DEBUG_FS - if (new_crtc_state->active && + if (IS_ENABLED(CONFIG_DEBUG_FS) && new_crtc_state->active && amdgpu_dm_is_valid_crc_source(dm_new_crtc_state->crc_src)) { /** * Frontend may have changed so reapply the CRC capture @@ -8401,7 +8400,6 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state) amdgpu_dm_crtc_configure_crc_source( crtc, dm_new_crtc_state, dm_new_crtc_state->crc_src); } -#endif } for_each_new_crtc_in_state(state, crtc, new_crtc_state, j) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crc.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crc.h index 0235bfb246e5..eba2f1d35d07 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crc.h +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crc.h @@ -46,13 +46,13 @@ static inline bool amdgpu_dm_is_valid_crc_source(enum amdgpu_dm_pipe_crc_source } /* amdgpu_dm_crc.c */ -#ifdef CONFIG_DEBUG_FS bool amdgpu_dm_crc_window_is_default(struct dm_crtc_state *dm_crtc_state); bool amdgpu_dm_crc_window_changed(struct dm_crtc_state *dm_new_crtc_state, struct dm_crtc_state *dm_old_crtc_state); int amdgpu_dm_crtc_configure_crc_source(struct drm_crtc *crtc, struct dm_crtc_state *dm_crtc_state, enum amdgpu_dm_pipe_crc_source source); +#ifdef CONFIG_DEBUG_FS int amdgpu_dm_crtc_set_crc_source(struct drm_crtc *crtc, const char *src_name); int amdgpu_dm_crtc_verify_crc_source(struct drm_crtc *crtc, const char *src_name, From e6d5c64efaa34aae3815a9afeb1314a976142e83 Mon Sep 17 00:00:00 2001 From: Jiawei Gu Date: Tue, 29 Dec 2020 20:35:33 +0800 Subject: [PATCH 413/547] drm/amdgpu: fix potential memory leak during navi12 deinitialization Navi12 HDCP & DTM deinitialization needs continue to free bo if already created though initialized flag is not set. Reviewed-by: Alex Deucher Signed-off-by: Jiawei Gu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 5d6fc369e32c..347fec669424 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -1315,8 +1315,12 @@ static int psp_hdcp_terminate(struct psp_context *psp) if (amdgpu_sriov_vf(psp->adev)) return 0; - if (!psp->hdcp_context.hdcp_initialized) - return 0; + if (!psp->hdcp_context.hdcp_initialized) { + if (psp->hdcp_context.hdcp_shared_buf) + goto out; + else + return 0; + } ret = psp_hdcp_unload(psp); if (ret) @@ -1324,6 +1328,7 @@ static int psp_hdcp_terminate(struct psp_context *psp) psp->hdcp_context.hdcp_initialized = false; +out: /* free hdcp shared memory */ amdgpu_bo_free_kernel(&psp->hdcp_context.hdcp_shared_bo, &psp->hdcp_context.hdcp_shared_mc_addr, @@ -1462,8 +1467,12 @@ static int psp_dtm_terminate(struct psp_context *psp) if (amdgpu_sriov_vf(psp->adev)) return 0; - if (!psp->dtm_context.dtm_initialized) - return 0; + if (!psp->dtm_context.dtm_initialized) { + if (psp->dtm_context.dtm_shared_buf) + goto out; + else + return 0; + } ret = psp_dtm_unload(psp); if (ret) @@ -1471,6 +1480,7 @@ static int psp_dtm_terminate(struct psp_context *psp) psp->dtm_context.dtm_initialized = false; +out: /* free hdcp shared memory */ amdgpu_bo_free_kernel(&psp->dtm_context.dtm_shared_bo, &psp->dtm_context.dtm_shared_mc_addr, From 8a82b347e8732fd2b68d26a6e9f0d9a1c397560d Mon Sep 17 00:00:00 2001 From: Dennis Li Date: Tue, 5 Jan 2021 08:37:21 +0800 Subject: [PATCH 414/547] drm/amdgpu: fix no bad_pages issue after umc ue injection old code wrongly used the bad page status as the function return value, which cause amdgpu_ras_badpages_read always return failed. Signed-off-by: Dennis Li Reviewed-by: Guchun Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index c136bd449744..82e952696d24 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1518,7 +1518,7 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data *data; int i = 0; - int ret = 0; + int ret = 0, status; if (!con || !con->eh_data || !bps || !count) return -EINVAL; @@ -1543,12 +1543,12 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, .size = AMDGPU_GPU_PAGE_SIZE, .flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED, }; - ret = amdgpu_vram_mgr_query_page_status( + status = amdgpu_vram_mgr_query_page_status( ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM), data->bps[i].retired_page); - if (ret == -EBUSY) + if (status == -EBUSY) (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING; - else if (ret == -ENOENT) + else if (status == -ENOENT) (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT; } From 3851c90b7aa8f0c275d14636f0e7ccca69a2bf84 Mon Sep 17 00:00:00 2001 From: John Clements Date: Tue, 5 Jan 2021 14:53:14 +0800 Subject: [PATCH 415/547] drm/amdgpu: enable ras eeprom support for sienna cichlid added I2C address and asic support flag Reviewed-by: Hawking Zhang Signed-off-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 1dd040166c63..19d9aa76cfbf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -30,6 +30,7 @@ #define EEPROM_I2C_TARGET_ADDR_VEGA20 0xA0 #define EEPROM_I2C_TARGET_ADDR_ARCTURUS 0xA8 #define EEPROM_I2C_TARGET_ADDR_ARCTURUS_D342 0xA0 +#define EEPROM_I2C_TARGET_ADDR_SIENNA_CICHLID 0xA0 /* * The 2 macros bellow represent the actual size in bytes that @@ -62,7 +63,8 @@ static bool __is_ras_eeprom_supported(struct amdgpu_device *adev) { if ((adev->asic_type == CHIP_VEGA20) || - (adev->asic_type == CHIP_ARCTURUS)) + (adev->asic_type == CHIP_ARCTURUS) || + (adev->asic_type == CHIP_SIENNA_CICHLID)) return true; return false; @@ -100,6 +102,10 @@ static bool __get_eeprom_i2c_addr(struct amdgpu_device *adev, case CHIP_ARCTURUS: return __get_eeprom_i2c_addr_arct(adev, i2c_addr); + case CHIP_SIENNA_CICHLID: + *i2c_addr = EEPROM_I2C_TARGET_ADDR_SIENNA_CICHLID; + break; + default: return false; } From c241ed2f0ea549c18cff62a3708b43846b84dae3 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 4 Jan 2021 11:24:20 -0500 Subject: [PATCH 416/547] drm/amdgpu/display: drop DCN support for aarch64 From Ard: "Simply disabling -mgeneral-regs-only left and right is risky, given that the standard AArch64 ABI permits the use of FP/SIMD registers anywhere, and GCC is known to use SIMD registers for spilling, and may invent other uses of the FP/SIMD register file that have nothing to do with the floating point code in question. Note that putting kernel_neon_begin() and kernel_neon_end() around the code that does use FP is not sufficient here, the problem is in all the other code that may be emitted with references to SIMD registers in it. So the only way to do this properly is to put all floating point code in a separate compilation unit, and only compile that unit with -mgeneral-regs-only." Disable support until the code can be properly refactored to support this properly on aarch64. Acked-by: Will Deacon Reported-by: Ard Biesheuvel Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/Kconfig | 2 +- drivers/gpu/drm/amd/display/dc/calcs/Makefile | 4 ---- .../gpu/drm/amd/display/dc/clk_mgr/Makefile | 21 ------------------- drivers/gpu/drm/amd/display/dc/dcn10/Makefile | 7 ------- .../drm/amd/display/dc/dcn10/dcn10_resource.c | 7 ------- drivers/gpu/drm/amd/display/dc/dcn20/Makefile | 4 ---- drivers/gpu/drm/amd/display/dc/dcn21/Makefile | 4 ---- drivers/gpu/drm/amd/display/dc/dcn30/Makefile | 5 ----- .../gpu/drm/amd/display/dc/dcn301/Makefile | 4 ---- .../gpu/drm/amd/display/dc/dcn302/Makefile | 4 ---- drivers/gpu/drm/amd/display/dc/dml/Makefile | 4 ---- drivers/gpu/drm/amd/display/dc/dsc/Makefile | 4 ---- drivers/gpu/drm/amd/display/dc/os_types.h | 4 ---- 13 files changed, 1 insertion(+), 73 deletions(-) diff --git a/drivers/gpu/drm/amd/display/Kconfig b/drivers/gpu/drm/amd/display/Kconfig index 797b5d4b43e5..e509a175ed17 100644 --- a/drivers/gpu/drm/amd/display/Kconfig +++ b/drivers/gpu/drm/amd/display/Kconfig @@ -6,7 +6,7 @@ config DRM_AMD_DC bool "AMD DC - Enable new display engine" default y select SND_HDA_COMPONENT if SND_HDA_CORE - select DRM_AMD_DC_DCN if (X86 || PPC64 || (ARM64 && KERNEL_MODE_NEON)) && !(KCOV_INSTRUMENT_ALL && KCOV_ENABLE_COMPARISONS) + select DRM_AMD_DC_DCN if (X86 || PPC64) && !(KCOV_INSTRUMENT_ALL && KCOV_ENABLE_COMPARISONS) help Choose this option if you want to use the new display engine support for AMDGPU. This adds required support for Vega and diff --git a/drivers/gpu/drm/amd/display/dc/calcs/Makefile b/drivers/gpu/drm/amd/display/dc/calcs/Makefile index 64f515d74410..f3c00f479e1c 100644 --- a/drivers/gpu/drm/amd/display/dc/calcs/Makefile +++ b/drivers/gpu/drm/amd/display/dc/calcs/Makefile @@ -33,10 +33,6 @@ ifdef CONFIG_PPC64 calcs_ccflags := -mhard-float -maltivec endif -ifdef CONFIG_ARM64 -calcs_rcflags := -mgeneral-regs-only -endif - ifdef CONFIG_CC_IS_GCC ifeq ($(call cc-ifversion, -lt, 0701, y), y) IS_OLD_GCC = 1 diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/Makefile b/drivers/gpu/drm/amd/display/dc/clk_mgr/Makefile index d59b380e7b7f..ff96bee57bfc 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/Makefile +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/Makefile @@ -104,13 +104,6 @@ ifdef CONFIG_PPC64 CFLAGS_$(AMDDALPATH)/dc/clk_mgr/dcn21/rn_clk_mgr.o := $(call cc-option,-mno-gnu-attribute) endif -# prevent build errors: -# ...: '-mgeneral-regs-only' is incompatible with the use of floating-point types -# this file is unused on arm64, just like on ppc64 -ifdef CONFIG_ARM64 -CFLAGS_REMOVE_$(AMDDALPATH)/dc/clk_mgr/dcn21/rn_clk_mgr.o := -mgeneral-regs-only -endif - AMD_DAL_CLK_MGR_DCN21 = $(addprefix $(AMDDALPATH)/dc/clk_mgr/dcn21/,$(CLK_MGR_DCN21)) AMD_DISPLAY_FILES += $(AMD_DAL_CLK_MGR_DCN21) @@ -125,13 +118,6 @@ ifdef CONFIG_PPC64 CFLAGS_$(AMDDALPATH)/dc/clk_mgr/dcn30/dcn30_clk_mgr.o := $(call cc-option,-mno-gnu-attribute) endif -# prevent build errors: -# ...: '-mgeneral-regs-only' is incompatible with the use of floating-point types -# this file is unused on arm64, just like on ppc64 -ifdef CONFIG_ARM64 -CFLAGS_REMOVE_$(AMDDALPATH)/dc/clk_mgr/dcn30/dcn30_clk_mgr.o := -mgeneral-regs-only -endif - AMD_DAL_CLK_MGR_DCN30 = $(addprefix $(AMDDALPATH)/dc/clk_mgr/dcn30/,$(CLK_MGR_DCN30)) AMD_DISPLAY_FILES += $(AMD_DAL_CLK_MGR_DCN30) @@ -146,13 +132,6 @@ ifdef CONFIG_PPC64 CFLAGS_$(AMDDALPATH)/dc/clk_mgr/dcn301/vg_clk_mgr.o := $(call cc-option,-mno-gnu-attribute) endif -# prevent build errors: -# ...: '-mgeneral-regs-only' is incompatible with the use of floating-point types -# this file is unused on arm64, just like on ppc64 -ifdef CONFIG_ARM64 -CFLAGS_REMOVE_$(AMDDALPATH)/dc/clk_mgr/dcn301/vg_clk_mgr.o := -mgeneral-regs-only -endif - AMD_DAL_CLK_MGR_DCN301 = $(addprefix $(AMDDALPATH)/dc/clk_mgr/dcn301/,$(CLK_MGR_DCN301)) AMD_DISPLAY_FILES += $(AMD_DAL_CLK_MGR_DCN301) diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/Makefile b/drivers/gpu/drm/amd/display/dc/dcn10/Makefile index 733e6e6e43bd..62ad1a11bff9 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn10/Makefile +++ b/drivers/gpu/drm/amd/display/dc/dcn10/Makefile @@ -31,11 +31,4 @@ DCN10 = dcn10_init.o dcn10_resource.o dcn10_ipp.o dcn10_hw_sequencer.o \ AMD_DAL_DCN10 = $(addprefix $(AMDDALPATH)/dc/dcn10/,$(DCN10)) -# fix: -# ...: '-mgeneral-regs-only' is incompatible with the use of floating-point types -# aarch64 does not support soft-float, so use hard-float and handle this in code -ifdef CONFIG_ARM64 -CFLAGS_REMOVE_$(AMDDALPATH)/dc/dcn10/dcn10_resource.o := -mgeneral-regs-only -endif - AMD_DISPLAY_FILES += $(AMD_DAL_DCN10) diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_resource.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_resource.c index bdc37831535e..36745193c391 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_resource.c @@ -1534,15 +1534,8 @@ static bool dcn10_resource_construct( memcpy(dc->dcn_ip, &dcn10_ip_defaults, sizeof(dcn10_ip_defaults)); memcpy(dc->dcn_soc, &dcn10_soc_defaults, sizeof(dcn10_soc_defaults)); -#if defined(CONFIG_ARM64) - /* Aarch64 does not support -msoft-float/-mfloat-abi=soft */ - DC_FP_START(); - dcn10_resource_construct_fp(dc); - DC_FP_END(); -#else /* Other architectures we build for build this with soft-float */ dcn10_resource_construct_fp(dc); -#endif pool->base.pp_smu = dcn10_pp_smu_create(ctx); diff --git a/drivers/gpu/drm/amd/display/dc/dcn20/Makefile b/drivers/gpu/drm/amd/display/dc/dcn20/Makefile index 624cb1341ef1..5fcaf78334ff 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn20/Makefile +++ b/drivers/gpu/drm/amd/display/dc/dcn20/Makefile @@ -17,10 +17,6 @@ ifdef CONFIG_PPC64 CFLAGS_$(AMDDALPATH)/dc/dcn20/dcn20_resource.o := -mhard-float -maltivec endif -ifdef CONFIG_ARM64 -CFLAGS_REMOVE_$(AMDDALPATH)/dc/dcn20/dcn20_resource.o := -mgeneral-regs-only -endif - ifdef CONFIG_CC_IS_GCC ifeq ($(call cc-ifversion, -lt, 0701, y), y) IS_OLD_GCC = 1 diff --git a/drivers/gpu/drm/amd/display/dc/dcn21/Makefile b/drivers/gpu/drm/amd/display/dc/dcn21/Makefile index 1ee5fc03b7b3..bb8c95141082 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn21/Makefile +++ b/drivers/gpu/drm/amd/display/dc/dcn21/Makefile @@ -13,10 +13,6 @@ ifdef CONFIG_PPC64 CFLAGS_$(AMDDALPATH)/dc/dcn21/dcn21_resource.o := -mhard-float -maltivec endif -ifdef CONFIG_ARM64 -CFLAGS_REMOVE_$(AMDDALPATH)/dc/dcn21/dcn21_resource.o := -mgeneral-regs-only -endif - ifdef CONFIG_CC_IS_GCC ifeq ($(call cc-ifversion, -lt, 0701, y), y) IS_OLD_GCC = 1 diff --git a/drivers/gpu/drm/amd/display/dc/dcn30/Makefile b/drivers/gpu/drm/amd/display/dc/dcn30/Makefile index 248c2711aace..c20331eb62e0 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn30/Makefile +++ b/drivers/gpu/drm/amd/display/dc/dcn30/Makefile @@ -41,11 +41,6 @@ CFLAGS_$(AMDDALPATH)/dc/dcn30/dcn30_resource.o := -mhard-float -maltivec CFLAGS_$(AMDDALPATH)/dc/dcn30/dcn30_optc.o := -mhard-float -maltivec endif -ifdef CONFIG_ARM64 -CFLAGS_REMOVE_$(AMDDALPATH)/dc/dcn30/dcn30_resource.o := -mgeneral-regs-only -CFLAGS_REMOVE_$(AMDDALPATH)/dc/dcn30/dcn30_optc.o := -mgeneral-regs-only -endif - ifdef CONFIG_CC_IS_GCC ifeq ($(call cc-ifversion, -lt, 0701, y), y) IS_OLD_GCC = 1 diff --git a/drivers/gpu/drm/amd/display/dc/dcn301/Makefile b/drivers/gpu/drm/amd/display/dc/dcn301/Makefile index 2fd5d34e4ba6..3ca7d911d25c 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn301/Makefile +++ b/drivers/gpu/drm/amd/display/dc/dcn301/Makefile @@ -21,10 +21,6 @@ ifdef CONFIG_PPC64 CFLAGS_$(AMDDALPATH)/dc/dcn301/dcn301_resource.o := -mhard-float -maltivec endif -ifdef CONFIG_ARM64 -CFLAGS_REMOVE_$(AMDDALPATH)/dc/dcn301/dcn301_resource.o := -mgeneral-regs-only -endif - ifdef CONFIG_CC_IS_GCC ifeq ($(call cc-ifversion, -lt, 0701, y), y) IS_OLD_GCC = 1 diff --git a/drivers/gpu/drm/amd/display/dc/dcn302/Makefile b/drivers/gpu/drm/amd/display/dc/dcn302/Makefile index 36e44e1b07fa..8d4924b7dc22 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn302/Makefile +++ b/drivers/gpu/drm/amd/display/dc/dcn302/Makefile @@ -20,10 +20,6 @@ ifdef CONFIG_PPC64 CFLAGS_$(AMDDALPATH)/dc/dcn302/dcn302_resource.o := -mhard-float -maltivec endif -ifdef CONFIG_ARM64 -CFLAGS_REMOVE_$(AMDDALPATH)/dc/dcn302/dcn302_resource.o := -mgeneral-regs-only -endif - ifdef CONFIG_CC_IS_GCC ifeq ($(call cc-ifversion, -lt, 0701, y), y) IS_OLD_GCC = 1 diff --git a/drivers/gpu/drm/amd/display/dc/dml/Makefile b/drivers/gpu/drm/amd/display/dc/dml/Makefile index a02a33dcd70b..6bb7f2905821 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/Makefile +++ b/drivers/gpu/drm/amd/display/dc/dml/Makefile @@ -33,10 +33,6 @@ ifdef CONFIG_PPC64 dml_ccflags := -mhard-float -maltivec endif -ifdef CONFIG_ARM64 -dml_rcflags := -mgeneral-regs-only -endif - ifdef CONFIG_CC_IS_GCC ifeq ($(call cc-ifversion, -lt, 0701, y), y) IS_OLD_GCC = 1 diff --git a/drivers/gpu/drm/amd/display/dc/dsc/Makefile b/drivers/gpu/drm/amd/display/dc/dsc/Makefile index f2624a1156e5..8d31eb75c6a6 100644 --- a/drivers/gpu/drm/amd/display/dc/dsc/Makefile +++ b/drivers/gpu/drm/amd/display/dc/dsc/Makefile @@ -10,10 +10,6 @@ ifdef CONFIG_PPC64 dsc_ccflags := -mhard-float -maltivec endif -ifdef CONFIG_ARM64 -dsc_rcflags := -mgeneral-regs-only -endif - ifdef CONFIG_CC_IS_GCC ifeq ($(call cc-ifversion, -lt, 0701, y), y) IS_OLD_GCC = 1 diff --git a/drivers/gpu/drm/amd/display/dc/os_types.h b/drivers/gpu/drm/amd/display/dc/os_types.h index 95cb56929e79..126c2f3a4dd3 100644 --- a/drivers/gpu/drm/amd/display/dc/os_types.h +++ b/drivers/gpu/drm/amd/display/dc/os_types.h @@ -55,10 +55,6 @@ #include #define DC_FP_START() kernel_fpu_begin() #define DC_FP_END() kernel_fpu_end() -#elif defined(CONFIG_ARM64) -#include -#define DC_FP_START() kernel_neon_begin() -#define DC_FP_END() kernel_neon_end() #elif defined(CONFIG_PPC64) #include #include From 5efc1f4b454c6179d35e7b0c3eda0ad5763a00fc Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 5 Jan 2021 11:42:08 -0500 Subject: [PATCH 417/547] Revert "drm/amd/display: Fix memory leaks in S3 resume" This reverts commit a135a1b4c4db1f3b8cbed9676a40ede39feb3362. This leads to blank screens on some boards after replugging a display. Revert until we understand the root cause and can fix both the leak and the blank screen after replug. Bug: https://bugzilla.kernel.org/show_bug.cgi?id=211033 Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1427 Cc: Stylon Wang Cc: Harry Wentland Cc: Nicholas Kazlauskas Cc: Andre Tomt Cc: Oleksandr Natalenko Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 3fb6baf9b0ba..146486071d01 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -2386,8 +2386,7 @@ void amdgpu_dm_update_connector_after_detect( drm_connector_update_edid_property(connector, aconnector->edid); - aconnector->num_modes = drm_add_edid_modes(connector, aconnector->edid); - drm_connector_list_update(connector); + drm_add_edid_modes(connector, aconnector->edid); if (aconnector->dc_link->aux_mode) drm_dp_cec_set_edid(&aconnector->dm_dp_aux.aux, From 67a5a68013056cbcf0a647e36cb6f4622fb6a470 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valdis=20Kl=C4=93tnieks?= Date: Sat, 26 Dec 2020 13:21:58 -0500 Subject: [PATCH 418/547] gcc-plugins: fix gcc 11 indigestion with plugins... Fedora Rawhide has started including gcc 11,and the g++ compiler throws a wobbly when it hits scripts/gcc-plugins: HOSTCXX scripts/gcc-plugins/latent_entropy_plugin.so In file included from /usr/include/c++/11/type_traits:35, from /usr/lib/gcc/x86_64-redhat-linux/11/plugin/include/system.h:244, from /usr/lib/gcc/x86_64-redhat-linux/11/plugin/include/gcc-plugin.h:28, from scripts/gcc-plugins/gcc-common.h:7, from scripts/gcc-plugins/latent_entropy_plugin.c:78: /usr/include/c++/11/bits/c++0x_warning.h:32:2: error: #error This file requires compiler and library support for the ISO C++ 2011 standard. This support must be enabled with the -std=c++11 or -std=gnu++11 compiler options. 32 | #error This file requires compiler and library support \ In fact, it works just fine with c++11, which has been in gcc since 4.8, and we now require 4.9 as a minimum. Signed-off-by: Valdis Kletnieks Acked-by: Josh Poimboeuf Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/82487.1609006918@turing-police --- scripts/gcc-plugins/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/gcc-plugins/Makefile b/scripts/gcc-plugins/Makefile index d66949bfeba4..b5487cce69e8 100644 --- a/scripts/gcc-plugins/Makefile +++ b/scripts/gcc-plugins/Makefile @@ -22,9 +22,9 @@ always-y += $(GCC_PLUGIN) GCC_PLUGINS_DIR = $(shell $(CC) -print-file-name=plugin) plugin_cxxflags = -Wp,-MMD,$(depfile) $(KBUILD_HOSTCXXFLAGS) -fPIC \ - -I $(GCC_PLUGINS_DIR)/include -I $(obj) -std=gnu++98 \ + -I $(GCC_PLUGINS_DIR)/include -I $(obj) -std=gnu++11 \ -fno-rtti -fno-exceptions -fasynchronous-unwind-tables \ - -ggdb -Wno-narrowing -Wno-unused-variable -Wno-c++11-compat \ + -ggdb -Wno-narrowing -Wno-unused-variable \ -Wno-format-diag plugin_ldflags = -shared From 6f02b540d7597f357bc6ee711346761045d4e108 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Wed, 6 Jan 2021 15:59:06 +0000 Subject: [PATCH 419/547] bpftool: Fix compilation failure for net.o with older glibc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For older glibc ~2.17, #include'ing both linux/if.h and net/if.h fails due to complaints about redefinition of interface flags: CC net.o In file included from net.c:13:0: /usr/include/linux/if.h:71:2: error: redeclaration of enumerator ‘IFF_UP’ IFF_UP = 1<<0, /* sysfs */ ^ /usr/include/net/if.h:44:5: note: previous definition of ‘IFF_UP’ was here IFF_UP = 0x1, /* Interface is up. */ The issue was fixed in kernel headers in [1], but since compilation of net.c picks up system headers the problem can recur. Dropping #include resolves the issue and it is not needed for compilation anyhow. [1] https://lore.kernel.org/netdev/1461512707-23058-1-git-send-email-mikko.rapeli__34748.27880641$1462831734$gmane$org@iki.fi/ Fixes: f6f3bac08ff9 ("tools/bpf: bpftool: add net support") Signed-off-by: Alan Maguire Signed-off-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/1609948746-15369-1-git-send-email-alan.maguire@oracle.com --- tools/bpf/bpftool/net.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c index 3fae61ef6339..ff3aa0cf3997 100644 --- a/tools/bpf/bpftool/net.c +++ b/tools/bpf/bpftool/net.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include From 0d136f5cd9a7ba6ded7f8ff17e8b1ba680f37625 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Tue, 5 Jan 2021 18:23:33 +0100 Subject: [PATCH 420/547] net: mvneta: fix error message when MTU too large for XDP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The error message says that "Jumbo frames are not supported on XDP", but the code checks for mtu > MVNETA_MAX_RX_BUF_SIZE, not mtu > 1500. Fix this error message. Signed-off-by: Marek Behún Fixes: 0db51da7a8e9 ("net: mvneta: add basic XDP support") Cc: Lorenzo Bianconi Cc: Thomas Petazzoni Link: https://lore.kernel.org/r/20210105172333.21613-1-kabel@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/mvneta.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 3369ec717a51..bc4d8d144401 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -4432,7 +4432,7 @@ static int mvneta_xdp_setup(struct net_device *dev, struct bpf_prog *prog, struct bpf_prog *old_prog; if (prog && dev->mtu > MVNETA_MAX_RX_BUF_SIZE) { - NL_SET_ERR_MSG_MOD(extack, "Jumbo frames not supported on XDP"); + NL_SET_ERR_MSG_MOD(extack, "MTU too large for XDP"); return -EOPNOTSUPP; } From 94bcfdbff0c210b17b27615f4952cc6ece7d5f5f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 5 Jan 2021 11:07:25 -0800 Subject: [PATCH 421/547] net: bareudp: add missing error handling for bareudp_link_config() .dellink does not get called after .newlink fails, bareudp_newlink() must undo what bareudp_configure() has done if bareudp_link_config() fails. v2: call bareudp_dellink(), like bareudp_dev_create() does Fixes: 571912c69f0e ("net: UDP tunnel encapsulation module for tunnelling different protocols like MPLS, IP, NSH etc.") Link: https://lore.kernel.org/r/20210105190725.1736246-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/bareudp.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c index 708171c0d628..85de5f96c02b 100644 --- a/drivers/net/bareudp.c +++ b/drivers/net/bareudp.c @@ -645,11 +645,20 @@ static int bareudp_link_config(struct net_device *dev, return 0; } +static void bareudp_dellink(struct net_device *dev, struct list_head *head) +{ + struct bareudp_dev *bareudp = netdev_priv(dev); + + list_del(&bareudp->next); + unregister_netdevice_queue(dev, head); +} + static int bareudp_newlink(struct net *net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct bareudp_conf conf; + LIST_HEAD(list_kill); int err; err = bareudp2info(data, &conf, extack); @@ -662,17 +671,14 @@ static int bareudp_newlink(struct net *net, struct net_device *dev, err = bareudp_link_config(dev, tb); if (err) - return err; + goto err_unconfig; return 0; -} -static void bareudp_dellink(struct net_device *dev, struct list_head *head) -{ - struct bareudp_dev *bareudp = netdev_priv(dev); - - list_del(&bareudp->next); - unregister_netdevice_queue(dev, head); +err_unconfig: + bareudp_dellink(dev, &list_kill); + unregister_netdevice_many(&list_kill); + return err; } static size_t bareudp_get_size(const struct net_device *dev) From 7f847db3040897f3ee25ce97265c545b5561f6c2 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 5 Jan 2021 18:18:15 -0800 Subject: [PATCH 422/547] net: dsa: fix led_classdev build errors Fix build errors when LEDS_CLASS=m and NET_DSA_HIRSCHMANN_HELLCREEK=y. This limits the latter to =m when LEDS_CLASS=m. microblaze-linux-ld: drivers/net/dsa/hirschmann/hellcreek_ptp.o: in function `hellcreek_ptp_setup': (.text+0xf80): undefined reference to `led_classdev_register_ext' microblaze-linux-ld: (.text+0xf94): undefined reference to `led_classdev_register_ext' microblaze-linux-ld: drivers/net/dsa/hirschmann/hellcreek_ptp.o: in function `hellcreek_ptp_free': (.text+0x1018): undefined reference to `led_classdev_unregister' microblaze-linux-ld: (.text+0x1024): undefined reference to `led_classdev_unregister' Signed-off-by: Randy Dunlap Reported-by: kernel test robot Link: lore.kernel.org/r/202101060655.iUvMJqS2-lkp@intel.com Cc: Kurt Kanzenbach Link: https://lore.kernel.org/r/20210106021815.31796-1-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- drivers/net/dsa/hirschmann/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/dsa/hirschmann/Kconfig b/drivers/net/dsa/hirschmann/Kconfig index 222dd35e2c9d..e01191107a4b 100644 --- a/drivers/net/dsa/hirschmann/Kconfig +++ b/drivers/net/dsa/hirschmann/Kconfig @@ -4,6 +4,7 @@ config NET_DSA_HIRSCHMANN_HELLCREEK depends on HAS_IOMEM depends on NET_DSA depends on PTP_1588_CLOCK + depends on LEDS_CLASS select NET_DSA_TAG_HELLCREEK help This driver adds support for Hirschmann Hellcreek TSN switches. From 1f685e6adbbe3c7b1bd9053be771b898d9efa655 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 5 Jan 2021 20:25:31 -0800 Subject: [PATCH 423/547] ptp: ptp_ines: prevent build when HAS_IOMEM is not set ptp_ines.c uses devm_platform_ioremap_resource(), which is only built/available when CONFIG_HAS_IOMEM is enabled. CONFIG_HAS_IOMEM is not enabled for arch/s390/, so builds on S390 have a build error: s390-linux-ld: drivers/ptp/ptp_ines.o: in function `ines_ptp_ctrl_probe': ptp_ines.c:(.text+0x17e6): undefined reference to `devm_platform_ioremap_resource' Prevent builds of ptp_ines.c when HAS_IOMEM is not set. Fixes: bad1eaa6ac31 ("ptp: Add a driver for InES time stamping IP core.") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Link: lore.kernel.org/r/202101031125.ZEFCUiKi-lkp@intel.com Acked-by: Richard Cochran Link: https://lore.kernel.org/r/20210106042531.1351-1-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- drivers/ptp/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig index d2bf05ccbbe2..f2edef0df40f 100644 --- a/drivers/ptp/Kconfig +++ b/drivers/ptp/Kconfig @@ -79,6 +79,7 @@ config DP83640_PHY config PTP_1588_CLOCK_INES tristate "ZHAW InES PTP time stamping IP core" depends on NETWORK_PHY_TIMESTAMPING + depends on HAS_IOMEM depends on PHYLIB depends on PTP_1588_CLOCK help From c4aec381ab98c9189d47b935832541d520f1f67f Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Tue, 15 Dec 2020 11:32:37 +0100 Subject: [PATCH 424/547] can: m_can: m_can_class_unregister(): remove erroneous m_can_clk_stop() In m_can_class_register() the clock is started, but stopped on exit. When calling m_can_class_unregister(), the clock is stopped a second time. This patch removes the erroneous m_can_clk_stop() in m_can_class_unregister(). Fixes: f524f829b75a ("can: m_can: Create a m_can platform framework") Cc: Dan Murphy Cc: Sriram Dash Reviewed-by: Sean Nyekjaer Link: https://lore.kernel.org/r/20201215103238.524029-2-mkl@pengutronix.de Signed-off-by: Marc Kleine-Budde --- drivers/net/can/m_can/m_can.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c index 2c9f12401276..da551fd0f502 100644 --- a/drivers/net/can/m_can/m_can.c +++ b/drivers/net/can/m_can/m_can.c @@ -1852,8 +1852,6 @@ EXPORT_SYMBOL_GPL(m_can_class_register); void m_can_class_unregister(struct m_can_classdev *cdev) { unregister_candev(cdev->net); - - m_can_clk_stop(cdev); } EXPORT_SYMBOL_GPL(m_can_class_unregister); From aee2b3ccc8a63d1cd7da6a8a153d1f3712d40826 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Tue, 15 Dec 2020 11:32:38 +0100 Subject: [PATCH 425/547] can: tcan4x5x: fix bittiming const, use common bittiming from m_can driver According to the TCAN4550 datasheet "SLLSF91 - DECEMBER 2018" the tcan4x5x has the same bittiming constants as a m_can revision 3.2.x/3.3.0. The tcan4x5x chip I'm using identifies itself as m_can revision 3.2.1, so remove the tcan4x5x specific bittiming values and rely on the values in the m_can driver, which are selected according to core revision. Fixes: 5443c226ba91 ("can: tcan4x5x: Add tcan4x5x driver to the kernel") Cc: Dan Murphy Reviewed-by: Sean Nyekjaer Link: https://lore.kernel.org/r/20201215103238.524029-3-mkl@pengutronix.de Signed-off-by: Marc Kleine-Budde --- drivers/net/can/m_can/tcan4x5x.c | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/drivers/net/can/m_can/tcan4x5x.c b/drivers/net/can/m_can/tcan4x5x.c index 24c737c4fc44..970f0e9d19bf 100644 --- a/drivers/net/can/m_can/tcan4x5x.c +++ b/drivers/net/can/m_can/tcan4x5x.c @@ -131,30 +131,6 @@ static inline struct tcan4x5x_priv *cdev_to_priv(struct m_can_classdev *cdev) } -static struct can_bittiming_const tcan4x5x_bittiming_const = { - .name = DEVICE_NAME, - .tseg1_min = 2, - .tseg1_max = 31, - .tseg2_min = 2, - .tseg2_max = 16, - .sjw_max = 16, - .brp_min = 1, - .brp_max = 32, - .brp_inc = 1, -}; - -static struct can_bittiming_const tcan4x5x_data_bittiming_const = { - .name = DEVICE_NAME, - .tseg1_min = 1, - .tseg1_max = 32, - .tseg2_min = 1, - .tseg2_max = 16, - .sjw_max = 16, - .brp_min = 1, - .brp_max = 32, - .brp_inc = 1, -}; - static void tcan4x5x_check_wake(struct tcan4x5x_priv *priv) { int wake_state = 0; @@ -469,8 +445,6 @@ static int tcan4x5x_can_probe(struct spi_device *spi) mcan_class->dev = &spi->dev; mcan_class->ops = &tcan4x5x_ops; mcan_class->is_peripheral = true; - mcan_class->bit_timing = &tcan4x5x_bittiming_const; - mcan_class->data_timing = &tcan4x5x_data_bittiming_const; mcan_class->net->irq = spi->irq; spi_set_drvdata(spi, priv); From 6086f02a18aeae795a61a3fc6566920891ea3b52 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Tue, 5 Jan 2021 22:41:37 +0100 Subject: [PATCH 426/547] can: mcp251xfd: mcp251xfd_handle_tefif(): fix TEF vs. TX race condition The mcp251xfd driver uses a TX FIFO for sending CAN frames and a TX Event FIFO (TEF) for completed TX-requests. The TEF event handling in the mcp251xfd_handle_tefif() function has a race condition. It first increments the tx-ring's tail counter to signal that there's room in the TX and TEF FIFO, then it increments the TEF FIFO in hardware. A running mcp251xfd_start_xmit() on a different CPU might not stop the txqueue (as the tx-ring still shows free space). The next mcp251xfd_start_xmit() will push a message into the chip and the TX complete event might overflow the TEF FIFO. This patch changes the order to fix the problem. Fixes: 68c0c1c7f966 ("can: mcp251xfd: tef-path: reduce number of SPI core requests to set UINC bit") Link: https://lore.kernel.org/r/20210105214138.3150886-2-mkl@pengutronix.de Signed-off-by: Marc Kleine-Budde --- drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c index 77129d5f410b..85a1a8b7c0e7 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c @@ -1368,13 +1368,10 @@ static int mcp251xfd_handle_tefif(struct mcp251xfd_priv *priv) struct mcp251xfd_tx_ring *tx_ring = priv->tx; struct spi_transfer *last_xfer; - tx_ring->tail += len; - /* Increment the TEF FIFO tail pointer 'len' times in * a single SPI message. - */ - - /* Note: + * + * Note: * * "cs_change == 1" on the last transfer results in an * active chip select after the complete SPI @@ -1391,6 +1388,8 @@ static int mcp251xfd_handle_tefif(struct mcp251xfd_priv *priv) if (err) return err; + tx_ring->tail += len; + err = mcp251xfd_check_tef_tail(priv); if (err) return err; From 2fbb397f584077e3c90abd06829f5a1f66fdd5f4 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Tue, 5 Jan 2021 22:41:38 +0100 Subject: [PATCH 427/547] can: mcp251xfd: mcp251xfd_handle_rxif_ring(): first increment RX tail pointer in HW, then in driver The previous patch fixes a TEF vs. TX race condition, by first updating the TEF tail pointer in hardware, and then updating the driver internal pointer. The same pattern exists in the RX-path, too. This should be no problem, as the driver accesses the RX-FIFO from the interrupt handler only, thus the access is properly serialized. Fix the order here, too, so that the TEF- and RX-path look similar. Fixes: 1f652bb6bae7 ("can: mcp25xxfd: rx-path: reduce number of SPI core requests to set UINC bit") Link: https://lore.kernel.org/r/20210105214138.3150886-3-mkl@pengutronix.de Signed-off-by: Marc Kleine-Budde --- drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c index 85a1a8b7c0e7..36235afb0bc6 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c @@ -1552,10 +1552,8 @@ mcp251xfd_handle_rxif_ring(struct mcp251xfd_priv *priv, /* Increment the RX FIFO tail pointer 'len' times in a * single SPI message. - */ - ring->tail += len; - - /* Note: + * + * Note: * * "cs_change == 1" on the last transfer results in an * active chip select after the complete SPI @@ -1571,6 +1569,8 @@ mcp251xfd_handle_rxif_ring(struct mcp251xfd_priv *priv, last_xfer->cs_change = 1; if (err) return err; + + ring->tail += len; } return 0; From 1169ec8f5d71044082a9898bbd1f1bf4a690c5a4 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Mon, 4 Jan 2021 09:03:27 +0000 Subject: [PATCH 428/547] can: rcar: Kconfig: update help description for CAN_RCAR config The rcar_can driver also supports RZ/G SoC's, update the description to reflect this. Signed-off-by: Lad Prabhakar Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20210104090327.6547-1-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Marc Kleine-Budde --- drivers/net/can/rcar/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/can/rcar/Kconfig b/drivers/net/can/rcar/Kconfig index 8d36101b78e3..29cabc20109e 100644 --- a/drivers/net/can/rcar/Kconfig +++ b/drivers/net/can/rcar/Kconfig @@ -1,10 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 config CAN_RCAR - tristate "Renesas R-Car CAN controller" + tristate "Renesas R-Car and RZ/G CAN controller" depends on ARCH_RENESAS || ARM help Say Y here if you want to use CAN controller found on Renesas R-Car - SoCs. + or RZ/G SoCs. To compile this driver as a module, choose M here: the module will be called rcar_can. From 6ee49118f87cf02b36f68812bc49855b7b627a2b Mon Sep 17 00:00:00 2001 From: Sriram Dash Date: Mon, 4 Jan 2021 18:01:34 +0530 Subject: [PATCH 429/547] MAINTAINERS: Update MCAN MMIO device driver maintainer Update Pankaj Sharma as maintainer for mcan mmio device driver as I will be moving to a different role. Signed-off-by: Sriram Dash Acked-by: Pankaj Sharma Link: https://lore.kernel.org/r/20210104123134.16930-1-sriram.dash@samsung.com Signed-off-by: Marc Kleine-Budde --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 7c1e45c416b1..b15514a770e3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10847,7 +10847,7 @@ F: drivers/media/radio/radio-maxiradio* MCAN MMIO DEVICE DRIVER M: Dan Murphy -M: Sriram Dash +M: Pankaj Sharma L: linux-can@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/net/can/bosch,m_can.yaml From 83b5bd628f65e6b4d1924b307d6a88a57827bdb0 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 4 Jan 2021 12:36:14 +0000 Subject: [PATCH 430/547] arm64: Move PSTATE.TCO setting to separate functions For consistency with __uaccess_{disable,enable}_hw_pan(), move the PSTATE.TCO setting into dedicated __uaccess_{disable,enable}_tco() functions. Signed-off-by: Catalin Marinas Acked-by: Vincenzo Frascino Acked-by: Mark Rutland --- arch/arm64/include/asm/uaccess.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 6f986e09a781..f0fe0cc6abe0 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -176,10 +176,21 @@ static inline void __uaccess_enable_hw_pan(void) * The Tag check override (TCO) bit disables temporarily the tag checking * preventing the issue. */ -static inline void uaccess_disable_privileged(void) +static inline void __uaccess_disable_tco(void) { asm volatile(ALTERNATIVE("nop", SET_PSTATE_TCO(0), ARM64_MTE, CONFIG_KASAN_HW_TAGS)); +} + +static inline void __uaccess_enable_tco(void) +{ + asm volatile(ALTERNATIVE("nop", SET_PSTATE_TCO(1), + ARM64_MTE, CONFIG_KASAN_HW_TAGS)); +} + +static inline void uaccess_disable_privileged(void) +{ + __uaccess_disable_tco(); if (uaccess_ttbr0_disable()) return; @@ -189,8 +200,7 @@ static inline void uaccess_disable_privileged(void) static inline void uaccess_enable_privileged(void) { - asm volatile(ALTERNATIVE("nop", SET_PSTATE_TCO(1), - ARM64_MTE, CONFIG_KASAN_HW_TAGS)); + __uaccess_enable_tco(); if (uaccess_ttbr0_enable()) return; From 05cd84691eafcd7959a1e120d5e72c0dd98c5d91 Mon Sep 17 00:00:00 2001 From: Charan Teja Reddy Date: Tue, 5 Jan 2021 20:06:39 +0530 Subject: [PATCH 431/547] dmabuf: fix use-after-free of dmabuf's file->f_inode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is observed 'use-after-free' on the dmabuf's file->f_inode with the race between closing the dmabuf file and reading the dmabuf's debug info. Consider the below scenario where P1 is closing the dma_buf file and P2 is reading the dma_buf's debug info in the system: P1 P2 dma_buf_debug_show() dma_buf_put() __fput() file->f_op->release() dput() .... dentry_unlink_inode() iput(dentry->d_inode) (where the inode is freed) mutex_lock(&db_list.lock) read 'dma_buf->file->f_inode' (the same inode is freed by P1) mutex_unlock(&db_list.lock) dentry->d_op->d_release()--> dma_buf_release() ..... mutex_lock(&db_list.lock) removes the dmabuf from the list mutex_unlock(&db_list.lock) In the above scenario, when dma_buf_put() is called on a dma_buf, it first frees the dma_buf's file->f_inode(=dentry->d_inode) and then removes this dma_buf from the system db_list. In between P2 traversing the db_list tries to access this dma_buf's file->f_inode that was freed by P1 which is a use-after-free case. Since, __fput() calls f_op->release first and then later calls the d_op->d_release, move the dma_buf's db_list removal from d_release() to f_op->release(). This ensures that dma_buf's file->f_inode is not accessed after it is released. Cc: # 5.4.x- Fixes: 4ab59c3c638c ("dma-buf: Move dma_buf_release() from fops to dentry_ops") Acked-by: Christian König Signed-off-by: Charan Teja Reddy Signed-off-by: Sumit Semwal Signed-off-by: Thomas Zimmermann Link: https://patchwork.freedesktop.org/patch/msgid/1609857399-31549-1-git-send-email-charante@codeaurora.org --- drivers/dma-buf/dma-buf.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index e63684d4cd90..9ad6397aaa97 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -76,10 +76,6 @@ static void dma_buf_release(struct dentry *dentry) dmabuf->ops->release(dmabuf); - mutex_lock(&db_list.lock); - list_del(&dmabuf->list_node); - mutex_unlock(&db_list.lock); - if (dmabuf->resv == (struct dma_resv *)&dmabuf[1]) dma_resv_fini(dmabuf->resv); @@ -88,6 +84,22 @@ static void dma_buf_release(struct dentry *dentry) kfree(dmabuf); } +static int dma_buf_file_release(struct inode *inode, struct file *file) +{ + struct dma_buf *dmabuf; + + if (!is_dma_buf_file(file)) + return -EINVAL; + + dmabuf = file->private_data; + + mutex_lock(&db_list.lock); + list_del(&dmabuf->list_node); + mutex_unlock(&db_list.lock); + + return 0; +} + static const struct dentry_operations dma_buf_dentry_ops = { .d_dname = dmabuffs_dname, .d_release = dma_buf_release, @@ -413,6 +425,7 @@ static void dma_buf_show_fdinfo(struct seq_file *m, struct file *file) } static const struct file_operations dma_buf_fops = { + .release = dma_buf_file_release, .mmap = dma_buf_mmap_internal, .llseek = dma_buf_llseek, .poll = dma_buf_poll, From e89eed02a5f1b864fa5abafc8e8e71bd9fd66d1f Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 5 Jan 2021 20:53:42 +0100 Subject: [PATCH 432/547] kcov, usb: hide in_serving_softirq checks in __usb_hcd_giveback_urb Done opencode in_serving_softirq() checks in in_serving_softirq() to avoid cluttering the code, hide them in kcov helpers instead. Fixes: aee9ddb1d371 ("kcov, usb: only collect coverage from __usb_hcd_giveback_urb in softirq") Signed-off-by: Andrey Konovalov Link: https://lore.kernel.org/r/aeb430c5bb90b0ccdf1ec302c70831c1a47b9c45.1609876340.git.andreyknvl@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hcd.c | 8 +++----- include/linux/kcov.h | 21 +++++++++++++++++++++ 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index 60886a7464c3..ad5a0f405a75 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -1649,14 +1649,12 @@ static void __usb_hcd_giveback_urb(struct urb *urb) urb->status = status; /* * This function can be called in task context inside another remote - * coverage collection section, but KCOV doesn't support that kind of + * coverage collection section, but kcov doesn't support that kind of * recursion yet. Only collect coverage in softirq context for now. */ - if (in_serving_softirq()) - kcov_remote_start_usb((u64)urb->dev->bus->busnum); + kcov_remote_start_usb_softirq((u64)urb->dev->bus->busnum); urb->complete(urb); - if (in_serving_softirq()) - kcov_remote_stop(); + kcov_remote_stop_softirq(); usb_anchor_resume_wakeups(anchor); atomic_dec(&urb->use_count); diff --git a/include/linux/kcov.h b/include/linux/kcov.h index a10e84707d82..4e3037dc1204 100644 --- a/include/linux/kcov.h +++ b/include/linux/kcov.h @@ -52,6 +52,25 @@ static inline void kcov_remote_start_usb(u64 id) kcov_remote_start(kcov_remote_handle(KCOV_SUBSYSTEM_USB, id)); } +/* + * The softirq flavor of kcov_remote_*() functions is introduced as a temporary + * work around for kcov's lack of nested remote coverage sections support in + * task context. Adding suport for nested sections is tracked in: + * https://bugzilla.kernel.org/show_bug.cgi?id=210337 + */ + +static inline void kcov_remote_start_usb_softirq(u64 id) +{ + if (in_serving_softirq()) + kcov_remote_start_usb(id); +} + +static inline void kcov_remote_stop_softirq(void) +{ + if (in_serving_softirq()) + kcov_remote_stop(); +} + #else static inline void kcov_task_init(struct task_struct *t) {} @@ -66,6 +85,8 @@ static inline u64 kcov_common_handle(void) } static inline void kcov_remote_start_common(u64 id) {} static inline void kcov_remote_start_usb(u64 id) {} +static inline void kcov_remote_start_usb_softirq(u64 id) {} +static inline void kcov_remote_stop_softirq(void) {} #endif /* CONFIG_KCOV */ #endif /* _LINUX_KCOV_H */ From e2459108b5a0604c4b472cae2b3cb8d3444c77fb Mon Sep 17 00:00:00 2001 From: "taehyun.cho" Date: Thu, 7 Jan 2021 00:46:25 +0900 Subject: [PATCH 433/547] usb: gadget: enable super speed plus Enable Super speed plus in configfs to support USB3.1 Gen2. This ensures that when a USB gadget is plugged in, it is enumerated as Gen 2 and connected at 10 Gbps if the host and cable are capable of it. Many in-tree gadget functions (fs, midi, acm, ncm, mass_storage, etc.) already have SuperSpeed Plus support. Tested: plugged gadget into Linux host and saw: [284907.385986] usb 8-2: new SuperSpeedPlus Gen 2 USB device number 3 using xhci_hcd Tested-by: Lorenzo Colitti Acked-by: Felipe Balbi Signed-off-by: taehyun.cho Signed-off-by: Lorenzo Colitti Link: https://lore.kernel.org/r/20210106154625.2801030-1-lorenzo@google.com Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/configfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/configfs.c b/drivers/usb/gadget/configfs.c index 408a5332a975..36ffb43f9c1a 100644 --- a/drivers/usb/gadget/configfs.c +++ b/drivers/usb/gadget/configfs.c @@ -1543,7 +1543,7 @@ static const struct usb_gadget_driver configfs_driver_template = { .suspend = configfs_composite_suspend, .resume = configfs_composite_resume, - .max_speed = USB_SPEED_SUPER, + .max_speed = USB_SPEED_SUPER_PLUS, .driver = { .owner = THIS_MODULE, .name = "configfs-gadget", @@ -1583,7 +1583,7 @@ static struct config_group *gadgets_make( gi->composite.unbind = configfs_do_nothing; gi->composite.suspend = NULL; gi->composite.resume = NULL; - gi->composite.max_speed = USB_SPEED_SUPER; + gi->composite.max_speed = USB_SPEED_SUPER_PLUS; spin_lock_init(&gi->spinlock); mutex_init(&gi->lock); From 41952a66015466c3208aac96b14ffd92e0943589 Mon Sep 17 00:00:00 2001 From: Peter Robinson Date: Wed, 6 Jan 2021 00:16:05 +0000 Subject: [PATCH 434/547] usb: typec: Fix copy paste error for NVIDIA alt-mode description The name of the module for the NVIDIA alt-mode is incorrect as it looks to be a copy-paste error from the entry above, update it to the correct typec_nvidia module name. Cc: Ajay Gupta Cc: Heikki Krogerus Signed-off-by: Peter Robinson Link: https://lore.kernel.org/r/20210106001605.167917-1-pbrobinson@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/altmodes/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/typec/altmodes/Kconfig b/drivers/usb/typec/altmodes/Kconfig index 187690fd1a5b..60d375e9c3c7 100644 --- a/drivers/usb/typec/altmodes/Kconfig +++ b/drivers/usb/typec/altmodes/Kconfig @@ -20,6 +20,6 @@ config TYPEC_NVIDIA_ALTMODE to enable support for VirtualLink devices with NVIDIA GPUs. To compile this driver as a module, choose M here: the - module will be called typec_displayport. + module will be called typec_nvidia. endmenu From 6c75c2bad36cfb43b144e6a0a76a69993c72097f Mon Sep 17 00:00:00 2001 From: Prashant Malani Date: Wed, 6 Jan 2021 19:49:04 -0800 Subject: [PATCH 435/547] usb: typec: Send uevent for num_altmodes update Generate a change uevent when the "number_of_alternate_modes" sysfs file for partners and plugs is updated by a port driver. Cc: Heikki Krogerus Cc: Benson Leung Signed-off-by: Prashant Malani Link: https://lore.kernel.org/r/20210107034904.4112029-1-pmalani@chromium.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/class.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/typec/class.c b/drivers/usb/typec/class.c index ebfd3113a9a8..8f77669f9cf4 100644 --- a/drivers/usb/typec/class.c +++ b/drivers/usb/typec/class.c @@ -766,6 +766,7 @@ int typec_partner_set_num_altmodes(struct typec_partner *partner, int num_altmod return ret; sysfs_notify(&partner->dev.kobj, NULL, "number_of_alternate_modes"); + kobject_uevent(&partner->dev.kobj, KOBJ_CHANGE); return 0; } @@ -923,6 +924,7 @@ int typec_plug_set_num_altmodes(struct typec_plug *plug, int num_altmodes) return ret; sysfs_notify(&plug->dev.kobj, NULL, "number_of_alternate_modes"); + kobject_uevent(&plug->dev.kobj, KOBJ_CHANGE); return 0; } From a5c7682aaaa10e42928d73de1c9e1e02d2b14c2e Mon Sep 17 00:00:00 2001 From: Thinh Nguyen Date: Mon, 4 Jan 2021 22:42:39 -0800 Subject: [PATCH 436/547] usb: dwc3: gadget: Clear wait flag on dequeue If an active transfer is dequeued, then the endpoint is freed to start a new transfer. Make sure to clear the endpoint's transfer wait flag for this case. Fixes: e0d19563eb6c ("usb: dwc3: gadget: Wait for transfer completion") Cc: stable@vger.kernel.org Acked-by: Felipe Balbi Signed-off-by: Thinh Nguyen Link: https://lore.kernel.org/r/b81cd5b5281cfbfdadb002c4bcf5c9be7c017cfd.1609828485.git.Thinh.Nguyen@synopsys.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/gadget.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index 25f654b79e48..ee44321fee38 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -1763,6 +1763,8 @@ static int dwc3_gadget_ep_dequeue(struct usb_ep *ep, list_for_each_entry_safe(r, t, &dep->started_list, list) dwc3_gadget_move_cancelled_request(r); + dep->flags &= ~DWC3_EP_WAIT_TRANSFER_COMPLETE; + goto out; } } From e0658f970a7f3d85431c6803b7d5169444fb11b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Tue, 5 Jan 2021 18:55:47 +0100 Subject: [PATCH 437/547] drm/radeon: stop re-init the TTM page pool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drivers are not supposed to init the page pool directly any more. Signed-off-by: Christian König Reviewed-by: Huang Rui Link: https://patchwork.freedesktop.org/patch/412153/ --- drivers/gpu/drm/radeon/radeon_ttm.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c index d59ef6e92a40..23195d5d4e91 100644 --- a/drivers/gpu/drm/radeon/radeon_ttm.c +++ b/drivers/gpu/drm/radeon/radeon_ttm.c @@ -730,9 +730,6 @@ int radeon_ttm_init(struct radeon_device *rdev) } rdev->mman.initialized = true; - ttm_pool_init(&rdev->mman.bdev.pool, rdev->dev, rdev->need_swiotlb, - dma_addressing_limited(&rdev->pdev->dev)); - r = radeon_ttm_init_vram(rdev); if (r) { DRM_ERROR("Failed initializing VRAM heap.\n"); From a73858ef4d5e1d425e171f0f6a52864176a6a979 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Tue, 5 Jan 2021 18:56:56 +0100 Subject: [PATCH 438/547] drm/ttm: unexport ttm_pool_init/fini MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drivers are not supposed to use this directly any more. Signed-off-by: Christian König Reviewed-by: Huang Rui Link: https://patchwork.freedesktop.org/patch/412156/ --- drivers/gpu/drm/ttm/ttm_pool.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c index 7b2f60616750..a00b7ab9c14c 100644 --- a/drivers/gpu/drm/ttm/ttm_pool.c +++ b/drivers/gpu/drm/ttm/ttm_pool.c @@ -507,7 +507,6 @@ void ttm_pool_init(struct ttm_pool *pool, struct device *dev, ttm_pool_type_init(&pool->caching[i].orders[j], pool, i, j); } -EXPORT_SYMBOL(ttm_pool_init); /** * ttm_pool_fini - Cleanup a pool @@ -525,7 +524,6 @@ void ttm_pool_fini(struct ttm_pool *pool) for (j = 0; j < MAX_ORDER; ++j) ttm_pool_type_fini(&pool->caching[i].orders[j]); } -EXPORT_SYMBOL(ttm_pool_fini); #ifdef CONFIG_DEBUG_FS /* Count the number of pages available in a pool_type */ From 1efd17e7acb6692bffc6c58718f41f27fdfd62f5 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 31 Dec 2020 08:53:19 +0800 Subject: [PATCH 439/547] iommu/vt-d: Fix misuse of ALIGN in qi_flush_piotlb() Use IS_ALIGNED() instead. Otherwise, an unaligned address will be ignored. Fixes: 33cd6e642d6a ("iommu/vt-d: Flush PASID-based iotlb for iova over first level") Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20201231005323.2178523-1-baolu.lu@linux.intel.com Signed-off-by: Will Deacon --- drivers/iommu/intel/dmar.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index b46dbfa6d0ed..004feaed3c72 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -1461,8 +1461,8 @@ void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr, int mask = ilog2(__roundup_pow_of_two(npages)); unsigned long align = (1ULL << (VTD_PAGE_SHIFT + mask)); - if (WARN_ON_ONCE(!ALIGN(addr, align))) - addr &= ~(align - 1); + if (WARN_ON_ONCE(!IS_ALIGNED(addr, align))) + addr = ALIGN_DOWN(addr, align); desc.qw0 = QI_EIOTLB_PASID(pasid) | QI_EIOTLB_DID(did) | From 4df7b2268ad81a74168130e1fb04550a8bc980e1 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 31 Dec 2020 08:53:22 +0800 Subject: [PATCH 440/547] Revert "iommu: Add quirk for Intel graphic devices in map_sg" This reverts commit 65f746e8285f0a67d43517d86fedb9e29ead49f2. As commit 8a473dbadccf ("drm/i915: Fix DMA mapped scatterlist walks") and commit 934941ed5a30 ("drm/i915: Fix DMA mapped scatterlist lookup") fixed the DMA scatterlist limitations in the i915 driver, remove this temporary workaround. Cc: Tvrtko Ursulin Cc: Tom Murphy Cc: Logan Gunthorpe Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20201231005323.2178523-4-baolu.lu@linux.intel.com Signed-off-by: Will Deacon --- drivers/iommu/dma-iommu.c | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index f0305e6aac1b..4078358ed66e 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -863,33 +863,6 @@ static int __finalise_sg(struct device *dev, struct scatterlist *sg, int nents, unsigned int cur_len = 0, max_len = dma_get_max_seg_size(dev); int i, count = 0; - /* - * The Intel graphic driver is used to assume that the returned - * sg list is not combound. This blocks the efforts of converting - * Intel IOMMU driver to dma-iommu api's. Add this quirk to make the - * device driver work and should be removed once it's fixed in i915 - * driver. - */ - if (IS_ENABLED(CONFIG_DRM_I915) && dev_is_pci(dev) && - to_pci_dev(dev)->vendor == PCI_VENDOR_ID_INTEL && - (to_pci_dev(dev)->class >> 16) == PCI_BASE_CLASS_DISPLAY) { - for_each_sg(sg, s, nents, i) { - unsigned int s_iova_off = sg_dma_address(s); - unsigned int s_length = sg_dma_len(s); - unsigned int s_iova_len = s->length; - - s->offset += s_iova_off; - s->length = s_length; - sg_dma_address(s) = dma_addr + s_iova_off; - sg_dma_len(s) = s_length; - dma_addr += s_iova_len; - - pr_info_once("sg combining disabled due to i915 driver\n"); - } - - return nents; - } - for_each_sg(sg, s, nents, i) { /* Restore this segment's original unaligned fields first */ unsigned int s_iova_off = sg_dma_address(s); From 420d42f6f9db27d88bc4f83e3e668fcdacbf7e29 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 31 Dec 2020 08:53:23 +0800 Subject: [PATCH 441/547] iommu/vt-d: Fix lockdep splat in sva bind()/unbind() Lock(&iommu->lock) without disabling irq causes lockdep warnings. ======================================================== WARNING: possible irq lock inversion dependency detected 5.11.0-rc1+ #828 Not tainted -------------------------------------------------------- kworker/0:1H/120 just changed the state of lock: ffffffffad9ea1b8 (device_domain_lock){..-.}-{2:2}, at: iommu_flush_dev_iotlb.part.0+0x32/0x120 but this lock took another, SOFTIRQ-unsafe lock in the past: (&iommu->lock){+.+.}-{2:2} and interrupts could create inverse lock ordering between them. other info that might help us debug this: Possible interrupt unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&iommu->lock); local_irq_disable(); lock(device_domain_lock); lock(&iommu->lock); lock(device_domain_lock); *** DEADLOCK *** Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20201231005323.2178523-5-baolu.lu@linux.intel.com Signed-off-by: Will Deacon --- drivers/iommu/intel/svm.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 4fa248b98031..9bcedd360235 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -281,6 +281,7 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, struct dmar_domain *dmar_domain; struct device_domain_info *info; struct intel_svm *svm = NULL; + unsigned long iflags; int ret = 0; if (WARN_ON(!iommu) || !data) @@ -381,12 +382,12 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, * each bind of a new device even with an existing PASID, we need to * call the nested mode setup function here. */ - spin_lock(&iommu->lock); + spin_lock_irqsave(&iommu->lock, iflags); ret = intel_pasid_setup_nested(iommu, dev, (pgd_t *)(uintptr_t)data->gpgd, data->hpasid, &data->vendor.vtd, dmar_domain, data->addr_width); - spin_unlock(&iommu->lock); + spin_unlock_irqrestore(&iommu->lock, iflags); if (ret) { dev_err_ratelimited(dev, "Failed to set up PASID %llu in nested mode, Err %d\n", data->hpasid, ret); @@ -486,6 +487,7 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags, struct device_domain_info *info; struct intel_svm_dev *sdev; struct intel_svm *svm = NULL; + unsigned long iflags; int pasid_max; int ret; @@ -605,14 +607,14 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags, } } - spin_lock(&iommu->lock); + spin_lock_irqsave(&iommu->lock, iflags); ret = intel_pasid_setup_first_level(iommu, dev, mm ? mm->pgd : init_mm.pgd, svm->pasid, FLPT_DEFAULT_DID, (mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) | (cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0)); - spin_unlock(&iommu->lock); + spin_unlock_irqrestore(&iommu->lock, iflags); if (ret) { if (mm) mmu_notifier_unregister(&svm->notifier, mm); @@ -632,14 +634,14 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags, * Binding a new device with existing PASID, need to setup * the PASID entry. */ - spin_lock(&iommu->lock); + spin_lock_irqsave(&iommu->lock, iflags); ret = intel_pasid_setup_first_level(iommu, dev, mm ? mm->pgd : init_mm.pgd, svm->pasid, FLPT_DEFAULT_DID, (mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) | (cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0)); - spin_unlock(&iommu->lock); + spin_unlock_irqrestore(&iommu->lock, iflags); if (ret) { kfree(sdev); goto out; From aded8c7c2b72f846a07a2c736b8e75bb8cf50a87 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Tue, 5 Jan 2021 16:50:38 -0800 Subject: [PATCH 442/547] iommu/arm-smmu-qcom: Initialize SCTLR of the bypass context On SM8150 it's occasionally observed that the boot hangs in between the writing of SMEs and context banks in arm_smmu_device_reset(). The problem seems to coincide with a display refresh happening after updating the stream mapping, but before clearing - and there by disabling translation - the context bank picked to emulate translation bypass. Resolve this by explicitly disabling the bypass context already in cfg_probe. Fixes: f9081b8ff593 ("iommu/arm-smmu-qcom: Implement S2CR quirk") Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20210106005038.4152731-1-bjorn.andersson@linaro.org Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 5dff7ffbef11..1b83d140742f 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -196,6 +196,8 @@ static int qcom_smmu_cfg_probe(struct arm_smmu_device *smmu) set_bit(qsmmu->bypass_cbndx, smmu->context_map); + arm_smmu_cb_write(smmu, qsmmu->bypass_cbndx, ARM_SMMU_CB_SCTLR, 0); + reg = FIELD_PREP(ARM_SMMU_CBAR_TYPE, CBAR_TYPE_S1_TRANS_S2_BYPASS); arm_smmu_gr1_write(smmu, ARM_SMMU_GR1_CBAR(qsmmu->bypass_cbndx), reg); } From 9ad9f45b3b91162b33abfe175ae75ab65718dbf5 Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Thu, 7 Jan 2021 00:03:55 +0800 Subject: [PATCH 443/547] iommu/vt-d: Move intel_iommu info from struct intel_svm to struct intel_svm_dev 'struct intel_svm' is shared by all devices bound to a give process, but records only a single pointer to a 'struct intel_iommu'. Consequently, cache invalidations may only be applied to a single DMAR unit, and are erroneously skipped for the other devices. In preparation for fixing this, rework the structures so that the iommu pointer resides in 'struct intel_svm_dev', allowing 'struct intel_svm' to track them in its device list. Fixes: 1c4f88b7f1f9 ("iommu/vt-d: Shared virtual address in scalable mode") Cc: Lu Baolu Cc: Jacob Pan Cc: Raj Ashok Cc: David Woodhouse Reported-by: Guo Kaijie Reported-by: Xin Zeng Signed-off-by: Guo Kaijie Signed-off-by: Xin Zeng Signed-off-by: Liu Yi L Tested-by: Guo Kaijie Cc: stable@vger.kernel.org # v5.0+ Acked-by: Lu Baolu Link: https://lore.kernel.org/r/1609949037-25291-2-git-send-email-yi.l.liu@intel.com Signed-off-by: Will Deacon --- drivers/iommu/intel/svm.c | 9 +++++---- include/linux/intel-iommu.h | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 9bcedd360235..790ef3497e7e 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -142,7 +142,7 @@ static void intel_flush_svm_range_dev (struct intel_svm *svm, struct intel_svm_d } desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(svm->iommu, &desc, 1, 0); + qi_submit_sync(sdev->iommu, &desc, 1, 0); if (sdev->dev_iotlb) { desc.qw0 = QI_DEV_EIOTLB_PASID(svm->pasid) | @@ -166,7 +166,7 @@ static void intel_flush_svm_range_dev (struct intel_svm *svm, struct intel_svm_d } desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(svm->iommu, &desc, 1, 0); + qi_submit_sync(sdev->iommu, &desc, 1, 0); } } @@ -211,7 +211,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) */ rcu_read_lock(); list_for_each_entry_rcu(sdev, &svm->devs, list) - intel_pasid_tear_down_entry(svm->iommu, sdev->dev, + intel_pasid_tear_down_entry(sdev->iommu, sdev->dev, svm->pasid, true); rcu_read_unlock(); @@ -364,6 +364,7 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, } sdev->dev = dev; sdev->sid = PCI_DEVID(info->bus, info->devfn); + sdev->iommu = iommu; /* Only count users if device has aux domains */ if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX)) @@ -548,6 +549,7 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags, goto out; } sdev->dev = dev; + sdev->iommu = iommu; ret = intel_iommu_enable_pasid(iommu, dev); if (ret) { @@ -577,7 +579,6 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags, kfree(sdev); goto out; } - svm->iommu = iommu; if (pasid_max > intel_pasid_max_id) pasid_max = intel_pasid_max_id; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index d956987ed032..94522685a0d9 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -758,6 +758,7 @@ struct intel_svm_dev { struct list_head list; struct rcu_head rcu; struct device *dev; + struct intel_iommu *iommu; struct svm_dev_ops *ops; struct iommu_sva sva; u32 pasid; @@ -771,7 +772,6 @@ struct intel_svm { struct mmu_notifier notifier; struct mm_struct *mm; - struct intel_iommu *iommu; unsigned int flags; u32 pasid; int gpasid; /* In case that guest PASID is different from host PASID */ From 18abda7a2d555783d28ea1701f3ec95e96237a86 Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Thu, 7 Jan 2021 00:03:56 +0800 Subject: [PATCH 444/547] iommu/vt-d: Fix general protection fault in aux_detach_device() The aux-domain attach/detach are not tracked, some data structures might be used after free. This causes general protection faults when multiple subdevices are created and assigned to a same guest machine: | general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] SMP NOPTI | RIP: 0010:intel_iommu_aux_detach_device+0x12a/0x1f0 | [...] | Call Trace: | iommu_aux_detach_device+0x24/0x70 | vfio_mdev_detach_domain+0x3b/0x60 | ? vfio_mdev_set_domain+0x50/0x50 | iommu_group_for_each_dev+0x4f/0x80 | vfio_iommu_detach_group.isra.0+0x22/0x30 | vfio_iommu_type1_detach_group.cold+0x71/0x211 | ? find_exported_symbol_in_section+0x4a/0xd0 | ? each_symbol_section+0x28/0x50 | __vfio_group_unset_container+0x4d/0x150 | vfio_group_try_dissolve_container+0x25/0x30 | vfio_group_put_external_user+0x13/0x20 | kvm_vfio_group_put_external_user+0x27/0x40 [kvm] | kvm_vfio_destroy+0x45/0xb0 [kvm] | kvm_put_kvm+0x1bb/0x2e0 [kvm] | kvm_vm_release+0x22/0x30 [kvm] | __fput+0xcc/0x260 | ____fput+0xe/0x10 | task_work_run+0x8f/0xb0 | do_exit+0x358/0xaf0 | ? wake_up_state+0x10/0x20 | ? signal_wake_up_state+0x1a/0x30 | do_group_exit+0x47/0xb0 | __x64_sys_exit_group+0x18/0x20 | do_syscall_64+0x57/0x1d0 | entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fix the crash by tracking the subdevices when attaching and detaching aux-domains. Fixes: 67b8e02b5e76 ("iommu/vt-d: Aux-domain specific domain attach/detach") Co-developed-by: Xin Zeng Signed-off-by: Xin Zeng Signed-off-by: Liu Yi L Acked-by: Lu Baolu Link: https://lore.kernel.org/r/1609949037-25291-3-git-send-email-yi.l.liu@intel.com Signed-off-by: Will Deacon --- drivers/iommu/intel/iommu.c | 99 +++++++++++++++++++++++++++---------- include/linux/intel-iommu.h | 16 ++++-- 2 files changed, 84 insertions(+), 31 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 788119c5b021..d7720a836268 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1877,6 +1877,7 @@ static struct dmar_domain *alloc_domain(int flags) domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL; domain->has_iotlb_device = false; INIT_LIST_HEAD(&domain->devices); + INIT_LIST_HEAD(&domain->subdevices); return domain; } @@ -2547,7 +2548,7 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, info->iommu = iommu; info->pasid_table = NULL; info->auxd_enabled = 0; - INIT_LIST_HEAD(&info->auxiliary_domains); + INIT_LIST_HEAD(&info->subdevices); if (dev && dev_is_pci(dev)) { struct pci_dev *pdev = to_pci_dev(info->dev); @@ -4475,33 +4476,61 @@ is_aux_domain(struct device *dev, struct iommu_domain *domain) domain->type == IOMMU_DOMAIN_UNMANAGED; } -static void auxiliary_link_device(struct dmar_domain *domain, - struct device *dev) +static inline struct subdev_domain_info * +lookup_subdev_info(struct dmar_domain *domain, struct device *dev) { - struct device_domain_info *info = get_domain_info(dev); + struct subdev_domain_info *sinfo; - assert_spin_locked(&device_domain_lock); - if (WARN_ON(!info)) - return; + if (!list_empty(&domain->subdevices)) { + list_for_each_entry(sinfo, &domain->subdevices, link_domain) { + if (sinfo->pdev == dev) + return sinfo; + } + } - domain->auxd_refcnt++; - list_add(&domain->auxd, &info->auxiliary_domains); + return NULL; } -static void auxiliary_unlink_device(struct dmar_domain *domain, - struct device *dev) +static int auxiliary_link_device(struct dmar_domain *domain, + struct device *dev) { struct device_domain_info *info = get_domain_info(dev); + struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev); assert_spin_locked(&device_domain_lock); if (WARN_ON(!info)) - return; + return -EINVAL; - list_del(&domain->auxd); - domain->auxd_refcnt--; + if (!sinfo) { + sinfo = kzalloc(sizeof(*sinfo), GFP_ATOMIC); + sinfo->domain = domain; + sinfo->pdev = dev; + list_add(&sinfo->link_phys, &info->subdevices); + list_add(&sinfo->link_domain, &domain->subdevices); + } - if (!domain->auxd_refcnt && domain->default_pasid > 0) - ioasid_put(domain->default_pasid); + return ++sinfo->users; +} + +static int auxiliary_unlink_device(struct dmar_domain *domain, + struct device *dev) +{ + struct device_domain_info *info = get_domain_info(dev); + struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev); + int ret; + + assert_spin_locked(&device_domain_lock); + if (WARN_ON(!info || !sinfo || sinfo->users <= 0)) + return -EINVAL; + + ret = --sinfo->users; + if (!ret) { + list_del(&sinfo->link_phys); + list_del(&sinfo->link_domain); + kfree(sinfo); + } + + return ret; } static int aux_domain_add_dev(struct dmar_domain *domain, @@ -4530,6 +4559,19 @@ static int aux_domain_add_dev(struct dmar_domain *domain, } spin_lock_irqsave(&device_domain_lock, flags); + ret = auxiliary_link_device(domain, dev); + if (ret <= 0) + goto link_failed; + + /* + * Subdevices from the same physical device can be attached to the + * same domain. For such cases, only the first subdevice attachment + * needs to go through the full steps in this function. So if ret > + * 1, just goto out. + */ + if (ret > 1) + goto out; + /* * iommu->lock must be held to attach domain to iommu and setup the * pasid entry for second level translation. @@ -4548,10 +4590,9 @@ static int aux_domain_add_dev(struct dmar_domain *domain, domain->default_pasid); if (ret) goto table_failed; + spin_unlock(&iommu->lock); - - auxiliary_link_device(domain, dev); - +out: spin_unlock_irqrestore(&device_domain_lock, flags); return 0; @@ -4560,8 +4601,10 @@ table_failed: domain_detach_iommu(domain, iommu); attach_failed: spin_unlock(&iommu->lock); + auxiliary_unlink_device(domain, dev); +link_failed: spin_unlock_irqrestore(&device_domain_lock, flags); - if (!domain->auxd_refcnt && domain->default_pasid > 0) + if (list_empty(&domain->subdevices) && domain->default_pasid > 0) ioasid_put(domain->default_pasid); return ret; @@ -4581,14 +4624,18 @@ static void aux_domain_remove_dev(struct dmar_domain *domain, info = get_domain_info(dev); iommu = info->iommu; - auxiliary_unlink_device(domain, dev); - - spin_lock(&iommu->lock); - intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false); - domain_detach_iommu(domain, iommu); - spin_unlock(&iommu->lock); + if (!auxiliary_unlink_device(domain, dev)) { + spin_lock(&iommu->lock); + intel_pasid_tear_down_entry(iommu, dev, + domain->default_pasid, false); + domain_detach_iommu(domain, iommu); + spin_unlock(&iommu->lock); + } spin_unlock_irqrestore(&device_domain_lock, flags); + + if (list_empty(&domain->subdevices) && domain->default_pasid > 0) + ioasid_put(domain->default_pasid); } static int prepare_domain_attach_device(struct iommu_domain *domain, diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 94522685a0d9..09c6a0bf3892 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -533,11 +533,10 @@ struct dmar_domain { /* Domain ids per IOMMU. Use u16 since * domain ids are 16 bit wide according * to VT-d spec, section 9.3 */ - unsigned int auxd_refcnt; /* Refcount of auxiliary attaching */ bool has_iotlb_device; struct list_head devices; /* all devices' list */ - struct list_head auxd; /* link to device's auxiliary list */ + struct list_head subdevices; /* all subdevices' list */ struct iova_domain iovad; /* iova's that belong to this domain */ struct dma_pte *pgd; /* virtual address */ @@ -610,14 +609,21 @@ struct intel_iommu { struct dmar_drhd_unit *drhd; }; +/* Per subdevice private data */ +struct subdev_domain_info { + struct list_head link_phys; /* link to phys device siblings */ + struct list_head link_domain; /* link to domain siblings */ + struct device *pdev; /* physical device derived from */ + struct dmar_domain *domain; /* aux-domain */ + int users; /* user count */ +}; + /* PCI domain-device relationship */ struct device_domain_info { struct list_head link; /* link to domain siblings */ struct list_head global; /* link to global list */ struct list_head table; /* link to pasid table */ - struct list_head auxiliary_domains; /* auxiliary domains - * attached to this device - */ + struct list_head subdevices; /* subdevices sibling */ u32 segment; /* PCI segment number */ u8 bus; /* PCI bus number */ u8 devfn; /* PCI devfn number */ From 7c29ada5e70083805bc3a68daa23441df421fbee Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Thu, 7 Jan 2021 00:03:57 +0800 Subject: [PATCH 445/547] iommu/vt-d: Fix ineffective devTLB invalidation for subdevices iommu_flush_dev_iotlb() is called to invalidate caches on a device but only loops over the devices which are fully-attached to the domain. For sub-devices, this is ineffective and can result in invalid caching entries left on the device. Fix the missing invalidation by adding a loop over the subdevices and ensuring that 'domain->has_iotlb_device' is updated when attaching to subdevices. Fixes: 67b8e02b5e76 ("iommu/vt-d: Aux-domain specific domain attach/detach") Signed-off-by: Liu Yi L Acked-by: Lu Baolu Link: https://lore.kernel.org/r/1609949037-25291-4-git-send-email-yi.l.liu@intel.com Signed-off-by: Will Deacon --- drivers/iommu/intel/iommu.c | 53 ++++++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index d7720a836268..65cf06d70bf4 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -719,6 +719,8 @@ static int domain_update_device_node(struct dmar_domain *domain) return nid; } +static void domain_update_iotlb(struct dmar_domain *domain); + /* Some capabilities may be different across iommus */ static void domain_update_iommu_cap(struct dmar_domain *domain) { @@ -744,6 +746,8 @@ static void domain_update_iommu_cap(struct dmar_domain *domain) domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); else domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); + + domain_update_iotlb(domain); } struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, @@ -1464,17 +1468,22 @@ static void domain_update_iotlb(struct dmar_domain *domain) assert_spin_locked(&device_domain_lock); - list_for_each_entry(info, &domain->devices, link) { - struct pci_dev *pdev; - - if (!info->dev || !dev_is_pci(info->dev)) - continue; - - pdev = to_pci_dev(info->dev); - if (pdev->ats_enabled) { + list_for_each_entry(info, &domain->devices, link) + if (info->ats_enabled) { has_iotlb_device = true; break; } + + if (!has_iotlb_device) { + struct subdev_domain_info *sinfo; + + list_for_each_entry(sinfo, &domain->subdevices, link_domain) { + info = get_domain_info(sinfo->pdev); + if (info && info->ats_enabled) { + has_iotlb_device = true; + break; + } + } } domain->has_iotlb_device = has_iotlb_device; @@ -1555,25 +1564,37 @@ static void iommu_disable_dev_iotlb(struct device_domain_info *info) #endif } +static void __iommu_flush_dev_iotlb(struct device_domain_info *info, + u64 addr, unsigned int mask) +{ + u16 sid, qdep; + + if (!info || !info->ats_enabled) + return; + + sid = info->bus << 8 | info->devfn; + qdep = info->ats_qdep; + qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, + qdep, addr, mask); +} + static void iommu_flush_dev_iotlb(struct dmar_domain *domain, u64 addr, unsigned mask) { - u16 sid, qdep; unsigned long flags; struct device_domain_info *info; + struct subdev_domain_info *sinfo; if (!domain->has_iotlb_device) return; spin_lock_irqsave(&device_domain_lock, flags); - list_for_each_entry(info, &domain->devices, link) { - if (!info->ats_enabled) - continue; + list_for_each_entry(info, &domain->devices, link) + __iommu_flush_dev_iotlb(info, addr, mask); - sid = info->bus << 8 | info->devfn; - qdep = info->ats_qdep; - qi_flush_dev_iotlb(info->iommu, sid, info->pfsid, - qdep, addr, mask); + list_for_each_entry(sinfo, &domain->subdevices, link_domain) { + info = get_domain_info(sinfo->pdev); + __iommu_flush_dev_iotlb(info, addr, mask); } spin_unlock_irqrestore(&device_domain_lock, flags); } From 80c18e4ac20c9cde420cb3ffab48c936147cf07d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 7 Jan 2021 03:15:41 +0000 Subject: [PATCH 446/547] io_uring: trigger eventfd for IOPOLL Make sure io_iopoll_complete() tries to wake up eventfd, which currently is skipped together with io_cqring_ev_posted() for non-SQPOLL IOPOLL. Add an iopoll version of io_cqring_ev_posted(), duplicates a bit of code, but they actually use different sets of wait queues may be for better. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 27a8c226abf8..91e517ad1421 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1713,6 +1713,16 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) eventfd_signal(ctx->cq_ev_fd, 1); } +static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) +{ + if (ctx->flags & IORING_SETUP_SQPOLL) { + if (waitqueue_active(&ctx->wait)) + wake_up(&ctx->wait); + } + if (io_should_trigger_evfd(ctx)) + eventfd_signal(ctx->cq_ev_fd, 1); +} + /* Returns true if there are no backlogged entries after the flush */ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, struct task_struct *tsk, @@ -2428,8 +2438,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, } io_commit_cqring(ctx); - if (ctx->flags & IORING_SETUP_SQPOLL) - io_cqring_ev_posted(ctx); + io_cqring_ev_posted_iopoll(ctx); io_req_free_batch_finish(ctx, &rb); if (!list_empty(&again)) From 4aa84f2ffa81f71e15e5cffc2cc6090dbee78f8e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 7 Jan 2021 03:15:42 +0000 Subject: [PATCH 447/547] io_uring: dont kill fasync under completion_lock CPU0 CPU1 ---- ---- lock(&new->fa_lock); local_irq_disable(); lock(&ctx->completion_lock); lock(&new->fa_lock); lock(&ctx->completion_lock); *** DEADLOCK *** Move kill_fasync() out of io_commit_cqring() to io_cqring_ev_posted(), so it doesn't hold completion_lock while doing it. That saves from the reported deadlock, and it's just nice to shorten the locking time and untangle nested locks (compl_lock -> wq_head::lock). Reported-by: syzbot+91ca3f25bd7f795f019c@syzkaller.appspotmail.com Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 91e517ad1421..401316fe2ae2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1345,11 +1345,6 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) /* order cqe stores with ring update */ smp_store_release(&rings->cq.tail, ctx->cached_cq_tail); - - if (wq_has_sleeper(&ctx->cq_wait)) { - wake_up_interruptible(&ctx->cq_wait); - kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); - } } static void io_put_identity(struct io_uring_task *tctx, struct io_kiocb *req) @@ -1711,6 +1706,10 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) wake_up(&ctx->sq_data->wait); if (io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); + if (wq_has_sleeper(&ctx->cq_wait)) { + wake_up_interruptible(&ctx->cq_wait); + kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); + } } static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) @@ -1721,6 +1720,10 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) } if (io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); + if (wq_has_sleeper(&ctx->cq_wait)) { + wake_up_interruptible(&ctx->cq_wait); + kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); + } } /* Returns true if there are no backlogged entries after the flush */ From b1445e59cc9a10fdb8f83810ae1f4feb941ab36b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 7 Jan 2021 03:15:43 +0000 Subject: [PATCH 448/547] io_uring: synchronise ev_posted() with waitqueues waitqueue_active() needs smp_mb() to be in sync with waitqueues modification, but we miss it in io_cqring_ev_posted*() apart from cq_wait() case. Take an smb_mb() out of wq_has_sleeper() making it waitqueue_active(), and place it a few lines before, so it can synchronise other waitqueue_active() as well. The patch doesn't add any additional overhead, so even if there are no problems currently, it's just safer to have it this way. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 401316fe2ae2..cb57e0360fcb 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1700,13 +1700,16 @@ static inline unsigned __io_cqring_events(struct io_ring_ctx *ctx) static void io_cqring_ev_posted(struct io_ring_ctx *ctx) { + /* see waitqueue_active() comment */ + smp_mb(); + if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait)) wake_up(&ctx->sq_data->wait); if (io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); - if (wq_has_sleeper(&ctx->cq_wait)) { + if (waitqueue_active(&ctx->cq_wait)) { wake_up_interruptible(&ctx->cq_wait); kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); } @@ -1714,13 +1717,16 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) { + /* see waitqueue_active() comment */ + smp_mb(); + if (ctx->flags & IORING_SETUP_SQPOLL) { if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); } if (io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); - if (wq_has_sleeper(&ctx->cq_wait)) { + if (waitqueue_active(&ctx->cq_wait)) { wake_up_interruptible(&ctx->cq_wait); kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); } From 04a6a536bc3fd1436fc78c546c6b3ecdccbfaf6d Mon Sep 17 00:00:00 2001 From: Satya Tangirala Date: Thu, 24 Dec 2020 04:49:54 +0000 Subject: [PATCH 449/547] fs: Fix freeze_bdev()/thaw_bdev() accounting of bd_fsfreeze_sb freeze/thaw_bdev() currently use bdev->bd_fsfreeze_count to infer whether or not bdev->bd_fsfreeze_sb is valid (it's valid iff bd_fsfreeze_count is non-zero). thaw_bdev() doesn't nullify bd_fsfreeze_sb. But this means a freeze_bdev() call followed by a thaw_bdev() call can leave bd_fsfreeze_sb with a non-null value, while bd_fsfreeze_count is zero. If freeze_bdev() is called again, and this time get_active_super() returns NULL (e.g. because the FS is unmounted), we'll end up with bd_fsfreeze_count > 0, but bd_fsfreeze_sb is *untouched* - it stays the same (now garbage) value. A subsequent thaw_bdev() will decide that the bd_fsfreeze_sb value is legitimate (since bd_fsfreeze_count > 0), and attempt to use it. Fix this by always setting bd_fsfreeze_sb to NULL when bd_fsfreeze_count is successfully decremented to 0 in thaw_sb(). Alternatively, we could set bd_fsfreeze_sb to whatever get_active_super() returns in freeze_bdev() whenever bd_fsfreeze_count is successfully incremented to 1 from 0 (which can be achieved cleanly by moving the line currently setting bd_fsfreeze_sb to immediately after the "sync:" label, but it might be a little too subtle/easily overlooked in future). This fixes the currently panicking xfstests generic/085. Fixes: 040f04bd2e82 ("fs: simplify freeze_bdev/thaw_bdev") Signed-off-by: Satya Tangirala Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Jens Axboe --- fs/block_dev.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/block_dev.c b/fs/block_dev.c index 3e5b02f6606c..e454c5a81043 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -605,6 +605,8 @@ int thaw_bdev(struct block_device *bdev) error = thaw_super(sb); if (error) bdev->bd_fsfreeze_count++; + else + bdev->bd_fsfreeze_sb = NULL; out: mutex_unlock(&bdev->bd_fsfreeze_mutex); return error; From 17ffd35809c34b9564edb10727d02eb62958ba5c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 5 Jan 2021 19:20:29 +0100 Subject: [PATCH 450/547] cpufreq: intel_pstate: Use HWP capabilities in intel_cpufreq_adjust_perf() If turbo P-states cannot be used, either due to the configuration of the processor, or because intel_pstate is not allowed to used them, the maximum available P-state with HWP enabled corresponds to the HWP_CAP.GUARANTEED value which is not static. It can be adjusted by an out-of-band agent or during an Intel Speed Select performance level change, so long as it remains less than or equal to HWP_CAP.MAX. However, if turbo P-states cannot be used, intel_cpufreq_adjust_perf() always uses pstate.max_pstate (set during the initialization of the driver only) as the maximum available P-state, so it may miss a change of the HWP_CAP.GUARANTEED value. Prevent that from happening by modifyig intel_cpufreq_adjust_perf() to always read the "guaranteed" and "maximum turbo" performance levels from the cached HWP_CAP value. Fixes: a365ab6b9dfb ("cpufreq: intel_pstate: Implement the ->adjust_perf() callback") Signed-off-by: Rafael J. Wysocki Acked-by: Srinivas Pandruvada --- drivers/cpufreq/intel_pstate.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 1a660466dd75..32bc11851b8d 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2653,12 +2653,13 @@ static void intel_cpufreq_adjust_perf(unsigned int cpunum, unsigned long capacity) { struct cpudata *cpu = all_cpu_data[cpunum]; + u64 hwp_cap = READ_ONCE(cpu->hwp_cap_cached); int old_pstate = cpu->pstate.current_pstate; int cap_pstate, min_pstate, max_pstate, target_pstate; update_turbo_state(); - cap_pstate = global.turbo_disabled ? cpu->pstate.max_pstate : - cpu->pstate.turbo_pstate; + cap_pstate = global.turbo_disabled ? HWP_GUARANTEED_PERF(hwp_cap) : + HWP_HIGHEST_PERF(hwp_cap); /* Optimization: Avoid unnecessary divisions. */ From 943bdd0cecad06da8392a33093230e30e501eccc Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 5 Jan 2021 10:19:57 +0000 Subject: [PATCH 451/547] cpufreq: powernow-k8: pass policy rather than use cpufreq_cpu_get() Currently there is an unlikely case where cpufreq_cpu_get() returns a NULL policy and this will cause a NULL pointer dereference later on. Fix this by passing the policy to transition_frequency_fidvid() from the caller and hence eliminating the need for the cpufreq_cpu_get() and cpufreq_cpu_put(). Thanks to Viresh Kumar for suggesting the fix. Addresses-Coverity: ("Dereference null return") Fixes: b43a7ffbf33b ("cpufreq: Notify all policy->cpus in cpufreq_notify_transition()") Suggested-by: Viresh Kumar Signed-off-by: Colin Ian King Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/powernow-k8.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/powernow-k8.c b/drivers/cpufreq/powernow-k8.c index 0acc9e241cd7..b9ccb6a3dad9 100644 --- a/drivers/cpufreq/powernow-k8.c +++ b/drivers/cpufreq/powernow-k8.c @@ -878,9 +878,9 @@ static int get_transition_latency(struct powernow_k8_data *data) /* Take a frequency, and issue the fid/vid transition command */ static int transition_frequency_fidvid(struct powernow_k8_data *data, - unsigned int index) + unsigned int index, + struct cpufreq_policy *policy) { - struct cpufreq_policy *policy; u32 fid = 0; u32 vid = 0; int res; @@ -912,9 +912,6 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, freqs.old = find_khz_freq_from_fid(data->currfid); freqs.new = find_khz_freq_from_fid(fid); - policy = cpufreq_cpu_get(smp_processor_id()); - cpufreq_cpu_put(policy); - cpufreq_freq_transition_begin(policy, &freqs); res = transition_fid_vid(data, fid, vid); cpufreq_freq_transition_end(policy, &freqs, res); @@ -969,7 +966,7 @@ static long powernowk8_target_fn(void *arg) powernow_k8_acpi_pst_values(data, newstate); - ret = transition_frequency_fidvid(data, newstate); + ret = transition_frequency_fidvid(data, newstate, pol); if (ret) { pr_err("transition frequency failed\n"); From aa7a1bb02bb44399be69b0a1cbb6495d9eec29fc Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 5 Jan 2021 19:19:18 +0100 Subject: [PATCH 452/547] ACPI: PM: s2idle: Drop unused local variables and related code Two local variables in drivers/acpi/x86/s2idle.c are never read, so drop them along with the code updating their values (in vain). Fixes: fef98671194b ("ACPI: PM: s2idle: Move x86-specific code to the x86 directory") Signed-off-by: Rafael J. Wysocki --- drivers/acpi/x86/s2idle.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/drivers/acpi/x86/s2idle.c b/drivers/acpi/x86/s2idle.c index 25fea34b544c..2b69536cdccb 100644 --- a/drivers/acpi/x86/s2idle.c +++ b/drivers/acpi/x86/s2idle.c @@ -105,18 +105,8 @@ static void lpi_device_get_constraints_amd(void) for (i = 0; i < out_obj->package.count; i++) { union acpi_object *package = &out_obj->package.elements[i]; - struct lpi_device_info_amd info = { }; - if (package->type == ACPI_TYPE_INTEGER) { - switch (i) { - case 0: - info.revision = package->integer.value; - break; - case 1: - info.count = package->integer.value; - break; - } - } else if (package->type == ACPI_TYPE_PACKAGE) { + if (package->type == ACPI_TYPE_PACKAGE) { lpi_constraints_table = kcalloc(package->package.count, sizeof(*lpi_constraints_table), GFP_KERNEL); @@ -135,12 +125,10 @@ static void lpi_device_get_constraints_amd(void) for (k = 0; k < info_obj->package.count; ++k) { union acpi_object *obj = &info_obj->package.elements[k]; - union acpi_object *obj_new; list = &lpi_constraints_table[lpi_constraints_table_size]; list->min_dstate = -1; - obj_new = &obj[k]; switch (k) { case 0: dev_info.enabled = obj->integer.value; From ee61cfd955a64a58ed35cbcfc54068fcbd486945 Mon Sep 17 00:00:00 2001 From: Shawn Guo Date: Thu, 31 Dec 2020 19:35:25 +0800 Subject: [PATCH 453/547] ACPI: scan: add stub acpi_create_platform_device() for !CONFIG_ACPI It adds a stub acpi_create_platform_device() for !CONFIG_ACPI build, so that caller doesn't have to deal with !CONFIG_ACPI build issue. Reported-by: kernel test robot Signed-off-by: Shawn Guo Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 2630c2e953f7..053bf05fb1f7 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -885,6 +885,13 @@ static inline int acpi_device_modalias(struct device *dev, return -ENODEV; } +static inline struct platform_device * +acpi_create_platform_device(struct acpi_device *adev, + struct property_entry *properties) +{ + return NULL; +} + static inline bool acpi_dma_supported(struct acpi_device *adev) { return false; From 240bdc605e6a9d0309bd003de3413f6f729eca18 Mon Sep 17 00:00:00 2001 From: Peter Robinson Date: Tue, 29 Dec 2020 11:17:59 +0000 Subject: [PATCH 454/547] ACPI: Update Kconfig help text for items that are no longer modular The CONTAINER and HOTPLUG_MEMORY options mention modules but are bool only, so if selected are always built in. Drop the help text about modules. Signed-off-by: Peter Robinson Signed-off-by: Rafael J. Wysocki --- drivers/acpi/Kconfig | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index edf1558c1105..ebcf534514be 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -395,9 +395,6 @@ config ACPI_CONTAINER This helps support hotplug of nodes, CPUs, and memory. - To compile this driver as a module, choose M here: - the module will be called container. - config ACPI_HOTPLUG_MEMORY bool "Memory Hotplug" depends on MEMORY_HOTPLUG @@ -411,9 +408,6 @@ config ACPI_HOTPLUG_MEMORY removing memory devices at runtime, you need not enable this driver. - To compile this driver as a module, choose M here: - the module will be called acpi_memhotplug. - config ACPI_HOTPLUG_IOAPIC bool depends on PCI From 47f4469970d8861bc06d2d4d45ac8200ff07c693 Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Tue, 5 Jan 2021 17:11:45 +0800 Subject: [PATCH 455/547] Revert "device property: Keep secondary firmware node secondary by type" While commit d5dcce0c414f ("device property: Keep secondary firmware node secondary by type") describes everything correct in its commit message, the change it made does the opposite and original commit c15e1bdda436 ("device property: Fix the secondary firmware node handling in set_primary_fwnode()") was fully correct. Revert the former one here and improve documentation in the next patch. Fixes: d5dcce0c414f ("device property: Keep secondary firmware node secondary by type") Signed-off-by: Bard Liao Reviewed-by: Andy Shevchenko Reviewed-by: Heikki Krogerus Cc: 5.10+ # 5.10+ Signed-off-by: Rafael J. Wysocki --- drivers/base/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/core.c b/drivers/base/core.c index 25e08e5f40bd..51b9545a050b 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -4433,7 +4433,7 @@ void set_primary_fwnode(struct device *dev, struct fwnode_handle *fwnode) if (fwnode_is_primary(fn)) { dev->fwnode = fn->secondary; if (!(parent && fn == parent->fwnode)) - fn->secondary = ERR_PTR(-ENODEV); + fn->secondary = NULL; } else { dev->fwnode = NULL; } From 3f7bddaf5d5a83aa2eb1e6d72db221d3ec43c813 Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Tue, 5 Jan 2021 17:11:46 +0800 Subject: [PATCH 456/547] device property: add description of fwnode cases There are only four valid fwnode cases which are - primary --> secondary --> -ENODEV - primary --> NULL - secondary --> -ENODEV - NULL dev->fwnode should be converted between the 4 cases above no matter how/when set_primary_fwnode() and set_secondary_fwnode() are called. Describe it in the code so people will keep it in mind. Signed-off-by: Bard Liao Reviewed-by: Andy Shevchenko Reviewed-by: Heikki Krogerus [ rjw: Comment edit ] Signed-off-by: Rafael J. Wysocki --- drivers/base/core.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/base/core.c b/drivers/base/core.c index 51b9545a050b..14f165816742 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -4414,6 +4414,12 @@ static inline bool fwnode_is_primary(struct fwnode_handle *fwnode) * * Set the device's firmware node pointer to @fwnode, but if a secondary * firmware node of the device is present, preserve it. + * + * Valid fwnode cases are: + * - primary --> secondary --> -ENODEV + * - primary --> NULL + * - secondary --> -ENODEV + * - NULL */ void set_primary_fwnode(struct device *dev, struct fwnode_handle *fwnode) { @@ -4432,6 +4438,7 @@ void set_primary_fwnode(struct device *dev, struct fwnode_handle *fwnode) } else { if (fwnode_is_primary(fn)) { dev->fwnode = fn->secondary; + /* Set fn->secondary = NULL, so fn remains the primary fwnode */ if (!(parent && fn == parent->fwnode)) fn->secondary = NULL; } else { From 2b5f09cadfc576817c0450e01d454f750909b103 Mon Sep 17 00:00:00 2001 From: Kuogee Hsieh Date: Fri, 18 Dec 2020 09:53:40 -0800 Subject: [PATCH 457/547] drm/msm/dp: postpone irq_hpd event during connection pending state irq_hpd event can only be executed at connected state. Therefore irq_hpd event should be postponed if it happened at connection pending state. This patch also make sure both link rate and lane are valid before start link training. Signed-off-by: Kuogee Hsieh Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/dp/dp_display.c | 7 +++++++ drivers/gpu/drm/msm/dp/dp_panel.c | 14 ++++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/msm/dp/dp_display.c b/drivers/gpu/drm/msm/dp/dp_display.c index 6e971d552911..3bc7ed21de28 100644 --- a/drivers/gpu/drm/msm/dp/dp_display.c +++ b/drivers/gpu/drm/msm/dp/dp_display.c @@ -693,6 +693,13 @@ static int dp_irq_hpd_handle(struct dp_display_private *dp, u32 data) return 0; } + if (state == ST_CONNECT_PENDING) { + /* wait until ST_CONNECTED */ + dp_add_event(dp, EV_IRQ_HPD_INT, 0, 1); /* delay = 1 */ + mutex_unlock(&dp->event_mutex); + return 0; + } + ret = dp_display_usbpd_attention_cb(&dp->pdev->dev); if (ret == -ECONNRESET) { /* cable unplugged */ dp->core_initialized = false; diff --git a/drivers/gpu/drm/msm/dp/dp_panel.c b/drivers/gpu/drm/msm/dp/dp_panel.c index 97dca3e378b7..d1780bcac8cc 100644 --- a/drivers/gpu/drm/msm/dp/dp_panel.c +++ b/drivers/gpu/drm/msm/dp/dp_panel.c @@ -167,14 +167,20 @@ int dp_panel_read_sink_caps(struct dp_panel *dp_panel, panel = container_of(dp_panel, struct dp_panel_private, dp_panel); rc = dp_panel_read_dpcd(dp_panel); - bw_code = drm_dp_link_rate_to_bw_code(dp_panel->link_info.rate); - if (rc || !is_link_rate_valid(bw_code) || - !is_lane_count_valid(dp_panel->link_info.num_lanes) || - (bw_code > dp_panel->max_bw_code)) { + if (rc) { DRM_ERROR("read dpcd failed %d\n", rc); return rc; } + bw_code = drm_dp_link_rate_to_bw_code(dp_panel->link_info.rate); + if (!is_link_rate_valid(bw_code) || + !is_lane_count_valid(dp_panel->link_info.num_lanes) || + (bw_code > dp_panel->max_bw_code)) { + DRM_ERROR("Illegal link rate=%d lane=%d\n", dp_panel->link_info.rate, + dp_panel->link_info.num_lanes); + return -EINVAL; + } + if (dp_panel->dfp_present) { rlen = drm_dp_dpcd_read(panel->aux, DP_SINK_COUNT, &count, 1); From d863f0c7b536288e2bd40cbc01c10465dd226b11 Mon Sep 17 00:00:00 2001 From: Craig Tatlor Date: Wed, 30 Dec 2020 17:29:42 +0200 Subject: [PATCH 458/547] drm/msm: Call msm_init_vram before binding the gpu vram.size is needed when binding a gpu without an iommu and is defined in msm_init_vram(), so run that before binding it. Signed-off-by: Craig Tatlor Reviewed-by: Brian Masney Tested-by: Alexey Minnekhanov Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_drv.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index ce9bb6e929c2..549ffb60e9ca 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -457,15 +457,15 @@ static int msm_drm_init(struct device *dev, struct drm_driver *drv) drm_mode_config_init(ddev); + ret = msm_init_vram(ddev); + if (ret) + goto err_destroy_mdss; + /* Bind all our sub-components: */ ret = component_bind_all(dev, ddev); if (ret) goto err_destroy_mdss; - ret = msm_init_vram(ddev); - if (ret) - goto err_msm_uninit; - dma_set_max_seg_size(dev, UINT_MAX); msm_gem_shrinker_init(ddev); From 3f7759e7b7585a0bffda06d4eddc6b0b850ef6c3 Mon Sep 17 00:00:00 2001 From: Iskren Chernev Date: Wed, 30 Dec 2020 17:29:43 +0200 Subject: [PATCH 459/547] drm/msm: Add modparam to allow vram carveout Using the GPU with a VRAM Carveout is a security vulnerability. Nevertheless it is sometimes required, especially when no IOMMU implementation is available for a certain platform. Signed-off-by: Iskren Chernev Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a2xx_gpu.c | 6 ++++-- drivers/gpu/drm/msm/adreno/a3xx_gpu.c | 6 ++++-- drivers/gpu/drm/msm/adreno/a4xx_gpu.c | 6 ++++-- drivers/gpu/drm/msm/adreno/adreno_device.c | 4 ++++ drivers/gpu/drm/msm/adreno/adreno_gpu.h | 1 + 5 files changed, 17 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a2xx_gpu.c b/drivers/gpu/drm/msm/adreno/a2xx_gpu.c index 7e82c41a85f1..bdc989183c64 100644 --- a/drivers/gpu/drm/msm/adreno/a2xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a2xx_gpu.c @@ -534,8 +534,10 @@ struct msm_gpu *a2xx_gpu_init(struct drm_device *dev) if (!gpu->aspace) { dev_err(dev->dev, "No memory protection without MMU\n"); - ret = -ENXIO; - goto fail; + if (!allow_vram_carveout) { + ret = -ENXIO; + goto fail; + } } return gpu; diff --git a/drivers/gpu/drm/msm/adreno/a3xx_gpu.c b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c index 93da6683a866..4534633fe7cd 100644 --- a/drivers/gpu/drm/msm/adreno/a3xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c @@ -564,8 +564,10 @@ struct msm_gpu *a3xx_gpu_init(struct drm_device *dev) * implement a cmdstream validator. */ DRM_DEV_ERROR(dev->dev, "No memory protection without IOMMU\n"); - ret = -ENXIO; - goto fail; + if (!allow_vram_carveout) { + ret = -ENXIO; + goto fail; + } } icc_path = devm_of_icc_get(&pdev->dev, "gfx-mem"); diff --git a/drivers/gpu/drm/msm/adreno/a4xx_gpu.c b/drivers/gpu/drm/msm/adreno/a4xx_gpu.c index c0be3a0f36b2..82bebb40234d 100644 --- a/drivers/gpu/drm/msm/adreno/a4xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a4xx_gpu.c @@ -692,8 +692,10 @@ struct msm_gpu *a4xx_gpu_init(struct drm_device *dev) * implement a cmdstream validator. */ DRM_DEV_ERROR(dev->dev, "No memory protection without IOMMU\n"); - ret = -ENXIO; - goto fail; + if (!allow_vram_carveout) { + ret = -ENXIO; + goto fail; + } } icc_path = devm_of_icc_get(&pdev->dev, "gfx-mem"); diff --git a/drivers/gpu/drm/msm/adreno/adreno_device.c b/drivers/gpu/drm/msm/adreno/adreno_device.c index 87c8b033ad1a..12e75ba360f9 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_device.c +++ b/drivers/gpu/drm/msm/adreno/adreno_device.c @@ -18,6 +18,10 @@ bool snapshot_debugbus = false; MODULE_PARM_DESC(snapshot_debugbus, "Include debugbus sections in GPU devcoredump (if not fused off)"); module_param_named(snapshot_debugbus, snapshot_debugbus, bool, 0600); +bool allow_vram_carveout = false; +MODULE_PARM_DESC(allow_vram_carveout, "Allow using VRAM Carveout, in place of IOMMU"); +module_param_named(allow_vram_carveout, allow_vram_carveout, bool, 0600); + static const struct adreno_info gpulist[] = { { .rev = ADRENO_REV(2, 0, 0, 0), diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h index c3775f79525a..fe5444a1482a 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h @@ -18,6 +18,7 @@ #include "adreno_pm4.xml.h" extern bool snapshot_debugbus; +extern bool allow_vram_carveout; enum { ADRENO_FW_PM4 = 0, From c4151604f0603d5700072183a05828ff87d764e4 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 21 Dec 2020 06:13:20 +0100 Subject: [PATCH 460/547] cpufreq: intel_pstate: remove obsolete functions percent_fp() was used in intel_pstate_pid_reset(), which was removed in commit 9d0ef7af1f2d ("cpufreq: intel_pstate: Do not use PID-based P-state selection") and hence, percent_fp() is unused since then. percent_ext_fp() was last used in intel_pstate_update_perf_limits(), which was refactored in commit 1a4fe38add8b ("cpufreq: intel_pstate: Remove max/min fractions to limit performance"), and hence, percent_ext_fp() is unused since then. make CC=clang W=1 points us those unused functions: drivers/cpufreq/intel_pstate.c:79:23: warning: unused function 'percent_fp' [-Wunused-function] static inline int32_t percent_fp(int percent) ^ drivers/cpufreq/intel_pstate.c:94:23: warning: unused function 'percent_ext_fp' [-Wunused-function] static inline int32_t percent_ext_fp(int percent) ^ Remove those obsolete functions. Signed-off-by: Lukas Bulwahn Reviewed-by: Nathan Chancellor Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 32bc11851b8d..be05e038d956 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -76,11 +76,6 @@ static inline int ceiling_fp(int32_t x) return ret; } -static inline int32_t percent_fp(int percent) -{ - return div_fp(percent, 100); -} - static inline u64 mul_ext_fp(u64 x, u64 y) { return (x * y) >> EXT_FRAC_BITS; @@ -91,11 +86,6 @@ static inline u64 div_ext_fp(u64 x, u64 y) return div64_u64(x << EXT_FRAC_BITS, y); } -static inline int32_t percent_ext_fp(int percent) -{ - return div_ext_fp(percent, 100); -} - /** * struct sample - Store performance sample * @core_avg_perf: Ratio of APERF/MPERF which is the actual average From 00fd44a1a4700718d5d962432b55c09820f7e709 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Mon, 4 Jan 2021 20:30:41 +0100 Subject: [PATCH 461/547] drm/msm: Only enable A6xx LLCC code on A6xx Using this code on A5xx (and probably older too) causes a smmu bug. Fixes: 474dadb8b0d5 ("drm/msm/a6xx: Add support for using system cache(LLC)") Signed-off-by: Konrad Dybcio Tested-by: AngeloGioacchino Del Regno Reviewed-by: Jordan Crouse Reviewed-by: Sai Prakash Ranjan Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 21 ++++++++++++--------- drivers/gpu/drm/msm/adreno/adreno_gpu.h | 5 +++++ 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index 6cf9975e951e..f09175698827 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -191,8 +191,6 @@ adreno_iommu_create_address_space(struct msm_gpu *gpu, struct platform_device *pdev) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); - struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu); - struct io_pgtable_domain_attr pgtbl_cfg; struct iommu_domain *iommu; struct msm_mmu *mmu; struct msm_gem_address_space *aspace; @@ -202,13 +200,18 @@ adreno_iommu_create_address_space(struct msm_gpu *gpu, if (!iommu) return NULL; - /* - * This allows GPU to set the bus attributes required to use system - * cache on behalf of the iommu page table walker. - */ - if (!IS_ERR(a6xx_gpu->htw_llc_slice)) { - pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_ARM_OUTER_WBWA; - iommu_domain_set_attr(iommu, DOMAIN_ATTR_IO_PGTABLE_CFG, &pgtbl_cfg); + + if (adreno_is_a6xx(adreno_gpu)) { + struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu); + struct io_pgtable_domain_attr pgtbl_cfg; + /* + * This allows GPU to set the bus attributes required to use system + * cache on behalf of the iommu page table walker. + */ + if (!IS_ERR(a6xx_gpu->htw_llc_slice)) { + pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_ARM_OUTER_WBWA; + iommu_domain_set_attr(iommu, DOMAIN_ATTR_IO_PGTABLE_CFG, &pgtbl_cfg); + } } mmu = msm_iommu_new(&pdev->dev, iommu); diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h index fe5444a1482a..b3d9a333591b 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h @@ -212,6 +212,11 @@ static inline int adreno_is_a540(struct adreno_gpu *gpu) return gpu->revn == 540; } +static inline bool adreno_is_a6xx(struct adreno_gpu *gpu) +{ + return ((gpu->revn < 700 && gpu->revn > 599)); +} + static inline int adreno_is_a618(struct adreno_gpu *gpu) { return gpu->revn == 618; From afded6d83aa7b35dab675c730528109cc58d6847 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 28 Dec 2020 20:43:13 +0200 Subject: [PATCH 462/547] misc: pvpanic: Check devm_ioport_map() for NULL Inconveniently devm_ioport_map() and devm_ioremap_resource() return errors differently, i.e. former uses simply NULL pointer, while the latter an error pointer. Due to this, we have to check each of them separately. Fixes: f104060813fe ("misc: pvpanic: Combine ACPI and platform drivers") Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20201228184313.57610-1-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/pvpanic.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/misc/pvpanic.c b/drivers/misc/pvpanic.c index 951b37da5e3c..41cab297d66e 100644 --- a/drivers/misc/pvpanic.c +++ b/drivers/misc/pvpanic.c @@ -55,12 +55,23 @@ static int pvpanic_mmio_probe(struct platform_device *pdev) struct resource *res; res = platform_get_mem_or_io(pdev, 0); - if (res && resource_type(res) == IORESOURCE_IO) + if (!res) + return -EINVAL; + + switch (resource_type(res)) { + case IORESOURCE_IO: base = devm_ioport_map(dev, res->start, resource_size(res)); - else + if (!base) + return -ENOMEM; + break; + case IORESOURCE_MEM: base = devm_ioremap_resource(dev, res); - if (IS_ERR(base)) - return PTR_ERR(base); + if (IS_ERR(base)) + return PTR_ERR(base); + break; + default: + return -EINVAL; + } atomic_notifier_chain_register(&panic_notifier_list, &pvpanic_panic_nb); From d8f5c29653c3f6995e8979be5623d263e92f6b86 Mon Sep 17 00:00:00 2001 From: Sean Tranchetti Date: Tue, 5 Jan 2021 16:22:25 -0800 Subject: [PATCH 463/547] net: ipv6: fib: flush exceptions when purging route Route removal is handled by two code paths. The main removal path is via fib6_del_route() which will handle purging any PMTU exceptions from the cache, removing all per-cpu copies of the DST entry used by the route, and releasing the fib6_info struct. The second removal location is during fib6_add_rt2node() during a route replacement operation. This path also calls fib6_purge_rt() to handle cleaning up the per-cpu copies of the DST entries and releasing the fib6_info associated with the older route, but it does not flush any PMTU exceptions that the older route had. Since the older route is removed from the tree during the replacement, we lose any way of accessing it again. As these lingering DSTs and the fib6_info struct are holding references to the underlying netdevice struct as well, unregistering that device from the kernel can never complete. Fixes: 2b760fcf5cfb3 ("ipv6: hook up exception table to store dst cache") Signed-off-by: Sean Tranchetti Reviewed-by: David Ahern Link: https://lore.kernel.org/r/1609892546-11389-1-git-send-email-stranche@quicinc.com Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_fib.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 605cdd38a919..f43e27555725 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1025,6 +1025,8 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, { struct fib6_table *table = rt->fib6_table; + /* Flush all cached dst in exception table */ + rt6_flush_exceptions(rt); fib6_drop_pcpu_from(rt, table); if (rt->nh && !list_empty(&rt->nh_list)) @@ -1927,9 +1929,6 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, net->ipv6.rt6_stats->fib_rt_entries--; net->ipv6.rt6_stats->fib_discarded_routes++; - /* Flush all cached dst in exception table */ - rt6_flush_exceptions(rt); - /* Reset round-robin state, if necessary */ if (rcu_access_pointer(fn->rr_ptr) == rt) fn->rr_ptr = NULL; From 5316a7c0130acf09bfc8bb0092407006010fcccc Mon Sep 17 00:00:00 2001 From: Sean Tranchetti Date: Tue, 5 Jan 2021 16:22:26 -0800 Subject: [PATCH 464/547] tools: selftests: add test for changing routes with PTMU exceptions Adds new 2 new tests to the PTMU script: pmtu_ipv4/6_route_change. These tests explicitly test for a recently discovered problem in the IPv6 routing framework where PMTU exceptions were not properly released when replacing a route via "ip route change ...". After creating PMTU exceptions, the route from the device A to R1 will be replaced with a new route, then device A will be deleted. If the PMTU exceptions were properly cleaned up by the kernel, this device deletion will succeed. Otherwise, the unregistration of the device will stall, and messages such as the following will be logged in dmesg: unregister_netdevice: waiting for veth_A-R1 to become free. Usage count = 4 Signed-off-by: Sean Tranchetti Reviewed-by: David Ahern Link: https://lore.kernel.org/r/1609892546-11389-2-git-send-email-stranche@quicinc.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/pmtu.sh | 71 ++++++++++++++++++++++++++++- 1 file changed, 69 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh index 464e31eabc73..64cd2e23c568 100755 --- a/tools/testing/selftests/net/pmtu.sh +++ b/tools/testing/selftests/net/pmtu.sh @@ -162,7 +162,15 @@ # - list_flush_ipv6_exception # Using the same topology as in pmtu_ipv6, create exceptions, and check # they are shown when listing exception caches, gone after flushing them - +# +# - pmtu_ipv4_route_change +# Use the same topology as in pmtu_ipv4, but issue a route replacement +# command and delete the corresponding device afterward. This tests for +# proper cleanup of the PMTU exceptions by the route replacement path. +# Device unregistration should complete successfully +# +# - pmtu_ipv6_route_change +# Same as above but with IPv6 # Kselftest framework requirement - SKIP code is 4. ksft_skip=4 @@ -224,7 +232,9 @@ tests=" cleanup_ipv4_exception ipv4: cleanup of cached exceptions 1 cleanup_ipv6_exception ipv6: cleanup of cached exceptions 1 list_flush_ipv4_exception ipv4: list and flush cached exceptions 1 - list_flush_ipv6_exception ipv6: list and flush cached exceptions 1" + list_flush_ipv6_exception ipv6: list and flush cached exceptions 1 + pmtu_ipv4_route_change ipv4: PMTU exception w/route replace 1 + pmtu_ipv6_route_change ipv6: PMTU exception w/route replace 1" NS_A="ns-A" NS_B="ns-B" @@ -1782,6 +1792,63 @@ test_list_flush_ipv6_exception() { return ${fail} } +test_pmtu_ipvX_route_change() { + family=${1} + + setup namespaces routing || return 2 + trace "${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \ + "${ns_r1}" veth_R1-B "${ns_b}" veth_B-R1 \ + "${ns_a}" veth_A-R2 "${ns_r2}" veth_R2-A \ + "${ns_r2}" veth_R2-B "${ns_b}" veth_B-R2 + + if [ ${family} -eq 4 ]; then + ping=ping + dst1="${prefix4}.${b_r1}.1" + dst2="${prefix4}.${b_r2}.1" + gw="${prefix4}.${a_r1}.2" + else + ping=${ping6} + dst1="${prefix6}:${b_r1}::1" + dst2="${prefix6}:${b_r2}::1" + gw="${prefix6}:${a_r1}::2" + fi + + # Set up initial MTU values + mtu "${ns_a}" veth_A-R1 2000 + mtu "${ns_r1}" veth_R1-A 2000 + mtu "${ns_r1}" veth_R1-B 1400 + mtu "${ns_b}" veth_B-R1 1400 + + mtu "${ns_a}" veth_A-R2 2000 + mtu "${ns_r2}" veth_R2-A 2000 + mtu "${ns_r2}" veth_R2-B 1500 + mtu "${ns_b}" veth_B-R2 1500 + + # Create route exceptions + run_cmd ${ns_a} ${ping} -q -M want -i 0.1 -w 1 -s 1800 ${dst1} + run_cmd ${ns_a} ${ping} -q -M want -i 0.1 -w 1 -s 1800 ${dst2} + + # Check that exceptions have been created with the correct PMTU + pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst1})" + check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1 + pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst2})" + check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1 + + # Replace the route from A to R1 + run_cmd ${ns_a} ip route change default via ${gw} + + # Delete the device in A + run_cmd ${ns_a} ip link del "veth_A-R1" +} + +test_pmtu_ipv4_route_change() { + test_pmtu_ipvX_route_change 4 +} + +test_pmtu_ipv6_route_change() { + test_pmtu_ipvX_route_change 6 +} + usage() { echo echo "$0 [OPTIONS] [TEST]..." From 384b77fd48fd683a82760bc88bef8611cba997fc Mon Sep 17 00:00:00 2001 From: Amanoel Dawod Date: Sat, 26 Dec 2020 18:58:40 -0500 Subject: [PATCH 465/547] Fonts: font_ter16x32: Update font with new upstream Terminus release This is just a maintenance patch to update font_ter16x32.c with changes and minor fixes added in new upstream Terminus v4.49. >From release notes of new version 4.49, this brings: - Altered ascii grave in some sizes to be more useful as a back quote. - Fixed 21B5, added 21B2 and 21B3. Just as my initial submission of the font, above changes were obtained from new ter-i32b.psf font source. Terminus font sources are available for download at SourceForge: https://sourceforge.net/projects/terminus-font/files/terminus-font-4.49/ Simply running `make` in source directory will build the .psf font files. Signed-off-by: Amanoel Dawod Link: https://lore.kernel.org/r/20201226235840.26290-1-kernel@amanoeldawod.com Signed-off-by: Greg Kroah-Hartman --- lib/fonts/font_ter16x32.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/fonts/font_ter16x32.c b/lib/fonts/font_ter16x32.c index 1955d624177c..5baedc573dd6 100644 --- a/lib/fonts/font_ter16x32.c +++ b/lib/fonts/font_ter16x32.c @@ -774,8 +774,8 @@ static const struct font_data fontdata_ter16x32 = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7f, 0xfc, 0x7f, 0xfc, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 95 */ - 0x00, 0x00, 0x1c, 0x00, 0x0e, 0x00, 0x07, 0x00, - 0x03, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x0e, 0x00, + 0x07, 0x00, 0x03, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -1169,7 +1169,7 @@ static const struct font_data fontdata_ter16x32 = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x7f, 0xf8, 0x7f, 0xfc, 0x03, 0x9e, 0x03, 0x8e, + 0x7e, 0xf8, 0x7f, 0xfc, 0x03, 0x9e, 0x03, 0x8e, 0x03, 0x8e, 0x3f, 0x8e, 0x7f, 0xfe, 0xf3, 0xfe, 0xe3, 0x80, 0xe3, 0x80, 0xe3, 0x80, 0xf3, 0xce, 0x7f, 0xfe, 0x3e, 0xfc, 0x00, 0x00, 0x00, 0x00, From abf8ef953a43e74aac3c54a94975f21bd483199b Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Mon, 30 Nov 2020 04:38:11 +0200 Subject: [PATCH 466/547] net/mlx5: Check if lag is supported before creating one MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch fixes a memleak issue by preventing to create a lag and add PFs if lag is not supported. comm “python3”, pid 349349, jiffies 4296985507 (age 1446.976s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ……………. 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ……………. backtrace: [<000000005b216ae7>] mlx5_lag_add+0x1d5/0×3f0 [mlx5_core] [<000000000445aa55>] mlx5e_nic_enable+0x66/0×1b0 [mlx5_core] [<00000000c56734c3>] mlx5e_attach_netdev+0x16e/0×200 [mlx5_core] [<0000000030439d1f>] mlx5e_attach+0x5c/0×90 [mlx5_core] [<0000000018fd8615>] mlx5e_add+0x1a4/0×410 [mlx5_core] [<0000000068bc504b>] mlx5_add_device+0x72/0×120 [mlx5_core] [<000000009fce51f9>] mlx5_register_device+0x77/0xb0 [mlx5_core] [<00000000d0d81ff3>] mlx5_load_one+0xc58/0×1eb0 [mlx5_core] [<0000000045077adc>] init_one+0x3ea/0×920 [mlx5_core] [<0000000043287674>] pci_device_probe+0xcd/0×150 [<00000000dafd3279>] really_probe+0x1c9/0×4b0 [<00000000f06bdd84>] driver_probe_device+0x5d/0×140 [<00000000e3d508b6>] device_driver_attach+0x4f/0×60 [<0000000084fba0f0>] bind_store+0xbf/0×120 [<00000000bf6622b3>] kernfs_fop_write+0x114/0×1b0 Fixes: 9b412cc35f00 ("net/mlx5e: Add LAG warning if bond slave is not lag master") Signed-off-by: Mark Zhang Reviewed-by: Leon Romanovsky Reviewed-by: Maor Gottlieb Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/lag.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag.c index f3d45ef082cd..83a05371e2aa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag.c @@ -564,7 +564,9 @@ void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev) struct mlx5_core_dev *tmp_dev; int i, err; - if (!MLX5_CAP_GEN(dev, vport_group_manager)) + if (!MLX5_CAP_GEN(dev, vport_group_manager) || + !MLX5_CAP_GEN(dev, lag_master) || + MLX5_CAP_GEN(dev, num_lag_ports) != MLX5_MAX_PORTS) return; tmp_dev = mlx5_get_next_phys_dev(dev); @@ -582,12 +584,9 @@ void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev) if (mlx5_lag_dev_add_pf(ldev, dev, netdev) < 0) return; - for (i = 0; i < MLX5_MAX_PORTS; i++) { - tmp_dev = ldev->pf[i].dev; - if (!tmp_dev || !MLX5_CAP_GEN(tmp_dev, lag_master) || - MLX5_CAP_GEN(tmp_dev, num_lag_ports) != MLX5_MAX_PORTS) + for (i = 0; i < MLX5_MAX_PORTS; i++) + if (!ldev->pf[i].dev) break; - } if (i >= MLX5_MAX_PORTS) ldev->flags |= MLX5_LAG_FLAG_READY; From 9c9be85f6b59d80efe4705109c0396df18d4e11d Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Tue, 24 Nov 2020 22:16:23 +0200 Subject: [PATCH 467/547] net/mlx5e: Add missing capability check for uplink follow Expose firmware indication that it supports setting eswitch uplink state to follow (follow the physical link). Condition setting the eswitch uplink admin-state with this capability bit. Older FW may not support the uplink state setting. Fixes: 7d0314b11cdd ("net/mlx5e: Modify uplink state on interface up/down") Signed-off-by: Aya Levin Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 3 ++- include/linux/mlx5/mlx5_ifc.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 7a79d330c075..6a852b4901aa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3161,7 +3161,8 @@ static void mlx5e_modify_admin_state(struct mlx5_core_dev *mdev, mlx5_set_port_admin_status(mdev, state); - if (mlx5_eswitch_mode(mdev) != MLX5_ESWITCH_LEGACY) + if (mlx5_eswitch_mode(mdev) == MLX5_ESWITCH_OFFLOADS || + !MLX5_CAP_GEN(mdev, uplink_follow)) return; if (state == MLX5_PORT_UP) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 8fbddec26eb8..442c0160caab 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1280,7 +1280,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 ece_support[0x1]; u8 reserved_at_a4[0x7]; u8 log_max_srq[0x5]; - u8 reserved_at_b0[0x2]; + u8 reserved_at_b0[0x1]; + u8 uplink_follow[0x1]; u8 ts_cqe_to_dest_cqn[0x1]; u8 reserved_at_b3[0xd]; From 0f2dcade69f2af56b74bce432e48ff3957830ce2 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Mon, 14 Dec 2020 03:38:40 +0200 Subject: [PATCH 468/547] net/mlx5: Use port_num 1 instead of 0 when delete a RoCE address In multi-port mode, FW reports syndrome 0x2ea48 (invalid vhca_port_number) if the port_num is not 1 or 2. Fixes: 80f09dfc237f ("net/mlx5: Eswitch, enable RoCE loopback traffic") Signed-off-by: Mark Zhang Reviewed-by: Maor Gottlieb Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/rdma.c b/drivers/net/ethernet/mellanox/mlx5/core/rdma.c index 0fc7de4aa572..8e0dddc6383f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/rdma.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/rdma.c @@ -116,7 +116,7 @@ free: static void mlx5_rdma_del_roce_addr(struct mlx5_core_dev *dev) { mlx5_core_roce_gid_set(dev, 0, 0, 0, - NULL, NULL, false, 0, 0); + NULL, NULL, false, 0, 1); } static void mlx5_rdma_make_default_gid(struct mlx5_core_dev *dev, union ib_gid *gid) From eed38eeee734756596e2cc163bdc7dac3be501b1 Mon Sep 17 00:00:00 2001 From: Oz Shlomo Date: Mon, 7 Dec 2020 08:15:18 +0000 Subject: [PATCH 469/547] net/mlx5e: CT: Use per flow counter when CT flow accounting is enabled Connection counters may be shared for both directions when the counter is used for connection aging purposes. However, if TC flow accounting is enabled then a unique counter is required per direction. Instantiate a unique counter per direction if the conntrack accounting extension is enabled. Use a shared counter when the connection accounting extension is disabled. Fixes: 1edae2335adf ("net/mlx5e: CT: Use the same counter for both directions") Signed-off-by: Oz Shlomo Reported-by: Marcelo Ricardo Leitner Reviewed-by: Roi Dayan Reviewed-by: Paul Blakey Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/en/tc_ct.c | 77 ++++++++++++------- 1 file changed, 49 insertions(+), 28 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c index e521254d886e..072363e73f1c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c @@ -118,16 +118,17 @@ struct mlx5_ct_tuple { u16 zone; }; -struct mlx5_ct_shared_counter { +struct mlx5_ct_counter { struct mlx5_fc *counter; refcount_t refcount; + bool is_shared; }; struct mlx5_ct_entry { struct rhash_head node; struct rhash_head tuple_node; struct rhash_head tuple_nat_node; - struct mlx5_ct_shared_counter *shared_counter; + struct mlx5_ct_counter *counter; unsigned long cookie; unsigned long restore_cookie; struct mlx5_ct_tuple tuple; @@ -394,13 +395,14 @@ mlx5_tc_ct_set_tuple_match(struct mlx5e_priv *priv, struct mlx5_flow_spec *spec, } static void -mlx5_tc_ct_shared_counter_put(struct mlx5_tc_ct_priv *ct_priv, struct mlx5_ct_entry *entry) +mlx5_tc_ct_counter_put(struct mlx5_tc_ct_priv *ct_priv, struct mlx5_ct_entry *entry) { - if (!refcount_dec_and_test(&entry->shared_counter->refcount)) + if (entry->counter->is_shared && + !refcount_dec_and_test(&entry->counter->refcount)) return; - mlx5_fc_destroy(ct_priv->dev, entry->shared_counter->counter); - kfree(entry->shared_counter); + mlx5_fc_destroy(ct_priv->dev, entry->counter->counter); + kfree(entry->counter); } static void @@ -699,7 +701,7 @@ mlx5_tc_ct_entry_add_rule(struct mlx5_tc_ct_priv *ct_priv, attr->dest_ft = ct_priv->post_ct; attr->ft = nat ? ct_priv->ct_nat : ct_priv->ct; attr->outer_match_level = MLX5_MATCH_L4; - attr->counter = entry->shared_counter->counter; + attr->counter = entry->counter->counter; attr->flags |= MLX5_ESW_ATTR_FLAG_NO_IN_PORT; mlx5_tc_ct_set_tuple_match(netdev_priv(ct_priv->netdev), spec, flow_rule); @@ -732,13 +734,34 @@ err_attr: return err; } -static struct mlx5_ct_shared_counter * +static struct mlx5_ct_counter * +mlx5_tc_ct_counter_create(struct mlx5_tc_ct_priv *ct_priv) +{ + struct mlx5_ct_counter *counter; + int ret; + + counter = kzalloc(sizeof(*counter), GFP_KERNEL); + if (!counter) + return ERR_PTR(-ENOMEM); + + counter->is_shared = false; + counter->counter = mlx5_fc_create(ct_priv->dev, true); + if (IS_ERR(counter->counter)) { + ct_dbg("Failed to create counter for ct entry"); + ret = PTR_ERR(counter->counter); + kfree(counter); + return ERR_PTR(ret); + } + + return counter; +} + +static struct mlx5_ct_counter * mlx5_tc_ct_shared_counter_get(struct mlx5_tc_ct_priv *ct_priv, struct mlx5_ct_entry *entry) { struct mlx5_ct_tuple rev_tuple = entry->tuple; - struct mlx5_ct_shared_counter *shared_counter; - struct mlx5_core_dev *dev = ct_priv->dev; + struct mlx5_ct_counter *shared_counter; struct mlx5_ct_entry *rev_entry; __be16 tmp_port; int ret; @@ -767,25 +790,20 @@ mlx5_tc_ct_shared_counter_get(struct mlx5_tc_ct_priv *ct_priv, rev_entry = rhashtable_lookup_fast(&ct_priv->ct_tuples_ht, &rev_tuple, tuples_ht_params); if (rev_entry) { - if (refcount_inc_not_zero(&rev_entry->shared_counter->refcount)) { + if (refcount_inc_not_zero(&rev_entry->counter->refcount)) { mutex_unlock(&ct_priv->shared_counter_lock); - return rev_entry->shared_counter; + return rev_entry->counter; } } mutex_unlock(&ct_priv->shared_counter_lock); - shared_counter = kzalloc(sizeof(*shared_counter), GFP_KERNEL); - if (!shared_counter) - return ERR_PTR(-ENOMEM); - - shared_counter->counter = mlx5_fc_create(dev, true); - if (IS_ERR(shared_counter->counter)) { - ct_dbg("Failed to create counter for ct entry"); - ret = PTR_ERR(shared_counter->counter); - kfree(shared_counter); + shared_counter = mlx5_tc_ct_counter_create(ct_priv); + if (IS_ERR(shared_counter)) { + ret = PTR_ERR(shared_counter); return ERR_PTR(ret); } + shared_counter->is_shared = true; refcount_set(&shared_counter->refcount, 1); return shared_counter; } @@ -798,10 +816,13 @@ mlx5_tc_ct_entry_add_rules(struct mlx5_tc_ct_priv *ct_priv, { int err; - entry->shared_counter = mlx5_tc_ct_shared_counter_get(ct_priv, entry); - if (IS_ERR(entry->shared_counter)) { - err = PTR_ERR(entry->shared_counter); - ct_dbg("Failed to create counter for ct entry"); + if (nf_ct_acct_enabled(dev_net(ct_priv->netdev))) + entry->counter = mlx5_tc_ct_counter_create(ct_priv); + else + entry->counter = mlx5_tc_ct_shared_counter_get(ct_priv, entry); + + if (IS_ERR(entry->counter)) { + err = PTR_ERR(entry->counter); return err; } @@ -820,7 +841,7 @@ mlx5_tc_ct_entry_add_rules(struct mlx5_tc_ct_priv *ct_priv, err_nat: mlx5_tc_ct_entry_del_rule(ct_priv, entry, false); err_orig: - mlx5_tc_ct_shared_counter_put(ct_priv, entry); + mlx5_tc_ct_counter_put(ct_priv, entry); return err; } @@ -918,7 +939,7 @@ mlx5_tc_ct_del_ft_entry(struct mlx5_tc_ct_priv *ct_priv, rhashtable_remove_fast(&ct_priv->ct_tuples_ht, &entry->tuple_node, tuples_ht_params); mutex_unlock(&ct_priv->shared_counter_lock); - mlx5_tc_ct_shared_counter_put(ct_priv, entry); + mlx5_tc_ct_counter_put(ct_priv, entry); } @@ -956,7 +977,7 @@ mlx5_tc_ct_block_flow_offload_stats(struct mlx5_ct_ft *ft, if (!entry) return -ENOENT; - mlx5_fc_query_cached(entry->shared_counter->counter, &bytes, &packets, &lastuse); + mlx5_fc_query_cached(entry->counter->counter, &bytes, &packets, &lastuse); flow_stats_update(&f->stats, bytes, packets, 0, lastuse, FLOW_ACTION_HW_STATS_DELAYED); From b544011f0e58ce43c40105468d6dc67f980a0c7a Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Fri, 13 Nov 2020 06:06:28 +0200 Subject: [PATCH 470/547] net/mlx5e: Fix SWP offsets when vlan inserted by driver In case WQE includes inline header the vlan is inserted by driver even if vlan offload is set. On geneve over vlan interface where software parser is used the SWP offsets should be updated according to the added vlan. Fixes: e3cfc7e6b7bd ("net/mlx5e: TX, Add geneve tunnel stateless offload support") Signed-off-by: Moshe Shemesh Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h | 9 +++++++++ .../net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h | 8 +++++--- drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 9 +++++---- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h index 7943eb30b837..4880f2179273 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h @@ -371,6 +371,15 @@ struct mlx5e_swp_spec { u8 tun_l4_proto; }; +static inline void mlx5e_eseg_swp_offsets_add_vlan(struct mlx5_wqe_eth_seg *eseg) +{ + /* SWP offsets are in 2-bytes words */ + eseg->swp_outer_l3_offset += VLAN_HLEN / 2; + eseg->swp_outer_l4_offset += VLAN_HLEN / 2; + eseg->swp_inner_l3_offset += VLAN_HLEN / 2; + eseg->swp_inner_l4_offset += VLAN_HLEN / 2; +} + static inline void mlx5e_set_eseg_swp(struct sk_buff *skb, struct mlx5_wqe_eth_seg *eseg, struct mlx5e_swp_spec *swp_spec) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h index 899b98aca0d3..1fae7fab8297 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h @@ -51,7 +51,7 @@ static inline bool mlx5_geneve_tx_allowed(struct mlx5_core_dev *mdev) } static inline void -mlx5e_tx_tunnel_accel(struct sk_buff *skb, struct mlx5_wqe_eth_seg *eseg) +mlx5e_tx_tunnel_accel(struct sk_buff *skb, struct mlx5_wqe_eth_seg *eseg, u16 ihs) { struct mlx5e_swp_spec swp_spec = {}; unsigned int offset = 0; @@ -85,6 +85,8 @@ mlx5e_tx_tunnel_accel(struct sk_buff *skb, struct mlx5_wqe_eth_seg *eseg) } mlx5e_set_eseg_swp(skb, eseg, &swp_spec); + if (skb_vlan_tag_present(skb) && ihs) + mlx5e_eseg_swp_offsets_add_vlan(eseg); } #else @@ -163,7 +165,7 @@ static inline unsigned int mlx5e_accel_tx_ids_len(struct mlx5e_txqsq *sq, static inline bool mlx5e_accel_tx_eseg(struct mlx5e_priv *priv, struct sk_buff *skb, - struct mlx5_wqe_eth_seg *eseg) + struct mlx5_wqe_eth_seg *eseg, u16 ihs) { #ifdef CONFIG_MLX5_EN_IPSEC if (xfrm_offload(skb)) @@ -172,7 +174,7 @@ static inline bool mlx5e_accel_tx_eseg(struct mlx5e_priv *priv, #if IS_ENABLED(CONFIG_GENEVE) if (skb->encapsulation) - mlx5e_tx_tunnel_accel(skb, eseg); + mlx5e_tx_tunnel_accel(skb, eseg, ihs); #endif return true; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index e47e2a0059d0..61ed671fe741 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -682,9 +682,9 @@ void mlx5e_tx_mpwqe_ensure_complete(struct mlx5e_txqsq *sq) static bool mlx5e_txwqe_build_eseg(struct mlx5e_priv *priv, struct mlx5e_txqsq *sq, struct sk_buff *skb, struct mlx5e_accel_tx_state *accel, - struct mlx5_wqe_eth_seg *eseg) + struct mlx5_wqe_eth_seg *eseg, u16 ihs) { - if (unlikely(!mlx5e_accel_tx_eseg(priv, skb, eseg))) + if (unlikely(!mlx5e_accel_tx_eseg(priv, skb, eseg, ihs))) return false; mlx5e_txwqe_build_eseg_csum(sq, skb, accel, eseg); @@ -714,7 +714,8 @@ netdev_tx_t mlx5e_xmit(struct sk_buff *skb, struct net_device *dev) if (mlx5e_tx_skb_supports_mpwqe(skb, &attr)) { struct mlx5_wqe_eth_seg eseg = {}; - if (unlikely(!mlx5e_txwqe_build_eseg(priv, sq, skb, &accel, &eseg))) + if (unlikely(!mlx5e_txwqe_build_eseg(priv, sq, skb, &accel, &eseg, + attr.ihs))) return NETDEV_TX_OK; mlx5e_sq_xmit_mpwqe(sq, skb, &eseg, netdev_xmit_more()); @@ -731,7 +732,7 @@ netdev_tx_t mlx5e_xmit(struct sk_buff *skb, struct net_device *dev) /* May update the WQE, but may not post other WQEs. */ mlx5e_accel_tx_finish(sq, wqe, &accel, (struct mlx5_wqe_inline_seg *)(wqe->data + wqe_attr.ds_cnt_inl)); - if (unlikely(!mlx5e_txwqe_build_eseg(priv, sq, skb, &accel, &wqe->eth))) + if (unlikely(!mlx5e_txwqe_build_eseg(priv, sq, skb, &accel, &wqe->eth, attr.ihs))) return NETDEV_TX_OK; mlx5e_sq_xmit_wqe(sq, skb, &attr, &wqe_attr, wqe, pi, netdev_xmit_more()); From 25c904b59aaf4816337acd415514b0c47715f604 Mon Sep 17 00:00:00 2001 From: Alaa Hleihel Date: Mon, 4 Jan 2021 12:54:40 +0200 Subject: [PATCH 471/547] net/mlx5: E-Switch, fix changing vf VLANID Adding vf VLANID for the first time, or after having cleared previously defined VLANID works fine, however, attempting to change an existing vf VLANID clears the rules on the firmware, but does not add new rules for the new vf VLANID. Fix this by changing the logic in function esw_acl_egress_lgcy_setup() so that it will always configure egress rules. Fixes: ea651a86d468 ("net/mlx5: E-Switch, Refactor eswitch egress acl codes") Signed-off-by: Alaa Hleihel Signed-off-by: Saeed Mahameed --- .../mellanox/mlx5/core/esw/acl/egress_lgcy.c | 25 +++++++++---------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c index 2b85d4777303..3e19b1721303 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/egress_lgcy.c @@ -95,22 +95,21 @@ int esw_acl_egress_lgcy_setup(struct mlx5_eswitch *esw, return 0; } - if (!IS_ERR_OR_NULL(vport->egress.acl)) - return 0; + if (!vport->egress.acl) { + vport->egress.acl = esw_acl_table_create(esw, vport->vport, + MLX5_FLOW_NAMESPACE_ESW_EGRESS, + table_size); + if (IS_ERR(vport->egress.acl)) { + err = PTR_ERR(vport->egress.acl); + vport->egress.acl = NULL; + goto out; + } - vport->egress.acl = esw_acl_table_create(esw, vport->vport, - MLX5_FLOW_NAMESPACE_ESW_EGRESS, - table_size); - if (IS_ERR(vport->egress.acl)) { - err = PTR_ERR(vport->egress.acl); - vport->egress.acl = NULL; - goto out; + err = esw_acl_egress_lgcy_groups_create(esw, vport); + if (err) + goto out; } - err = esw_acl_egress_lgcy_groups_create(esw, vport); - if (err) - goto out; - esw_debug(esw->dev, "vport[%d] configure egress rules, vlan(%d) qos(%d)\n", vport->vport, vport->info.vlan, vport->info.qos); From e13ed0ac064dd6ee964155ba9fdc2f3c3785934c Mon Sep 17 00:00:00 2001 From: Maor Dickman Date: Mon, 14 Dec 2020 13:53:03 +0200 Subject: [PATCH 472/547] net/mlx5e: In skb build skip setting mark in switchdev mode sop_drop_qpn field in the cqe is used by two features, in SWITCHDEV mode to restore the chain id in case of a miss and in LEGACY mode to support skbedit mark action. In build RX skb, the skb mark field is set regardless of the configured mode which cause a corruption of the mark field in case of switchdev mode. Fix by overriding the mark value back to 0 in the representor tc update skb flow. Fixes: 8f1e0b97cc70 ("net/mlx5: E-Switch, Mark miss packets with new chain id mapping") Signed-off-by: Maor Dickman Reviewed-by: Raed Salem Reviewed-by: Oz Shlomo Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c index d29af7b9c695..76177f7c5ec2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c @@ -626,6 +626,11 @@ bool mlx5e_rep_tc_update_skb(struct mlx5_cqe64 *cqe, if (!reg_c0) return true; + /* If reg_c0 is not equal to the default flow tag then skb->mark + * is not supported and must be reset back to 0. + */ + skb->mark = 0; + priv = netdev_priv(skb->dev); esw = priv->mdev->priv.eswitch; From b1c0aca3d3ddeebeec57ada9c2df9ed647939249 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Sun, 27 Dec 2020 16:33:19 +0200 Subject: [PATCH 473/547] net/mlx5e: ethtool, Fix restriction of autoneg with 56G Prior to this patch, configuring speed to 50G with autoneg off over devices supporting 50G per lane failed. Support for 50G per lane introduced a new set of link-modes, on which driver always performed a speed validation as if only legacy link-modes were configured. Fix driver speed validation to force setting autoneg over 56G only if in legacy link-mode. Fixes: 3d7cadae51f1 ("net/mlx5e: ethtool, Fix analysis of speed setting") Signed-off-by: Aya Levin Reviewed-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/en_ethtool.c | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index d9076d543104..2d37742a888c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -1010,6 +1010,22 @@ static int mlx5e_get_link_ksettings(struct net_device *netdev, return mlx5e_ethtool_get_link_ksettings(priv, link_ksettings); } +static int mlx5e_speed_validate(struct net_device *netdev, bool ext, + const unsigned long link_modes, u8 autoneg) +{ + /* Extended link-mode has no speed limitations. */ + if (ext) + return 0; + + if ((link_modes & MLX5E_PROT_MASK(MLX5E_56GBASE_R4)) && + autoneg != AUTONEG_ENABLE) { + netdev_err(netdev, "%s: 56G link speed requires autoneg enabled\n", + __func__); + return -EINVAL; + } + return 0; +} + static u32 mlx5e_ethtool2ptys_adver_link(const unsigned long *link_modes) { u32 i, ptys_modes = 0; @@ -1103,13 +1119,9 @@ int mlx5e_ethtool_set_link_ksettings(struct mlx5e_priv *priv, link_modes = autoneg == AUTONEG_ENABLE ? ethtool2ptys_adver_func(adver) : mlx5e_port_speed2linkmodes(mdev, speed, !ext); - if ((link_modes & MLX5E_PROT_MASK(MLX5E_56GBASE_R4)) && - autoneg != AUTONEG_ENABLE) { - netdev_err(priv->netdev, "%s: 56G link speed requires autoneg enabled\n", - __func__); - err = -EINVAL; + err = mlx5e_speed_validate(priv->netdev, ext, link_modes, autoneg); + if (err) goto out; - } link_modes = link_modes & eproto.cap; if (!link_modes) { From 4d8be21112f6fa2ac4b8a13f35866ad65b11d48c Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 4 Jan 2021 10:08:36 +0200 Subject: [PATCH 474/547] net/mlx5: Release devlink object if adev fails Add missed freeing previously allocated devlink object. Fixes: a925b5e309c9 ("net/mlx5: Register mlx5 devices to auxiliary virtual bus") Signed-off-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index c08315b51fd3..ca6f2fc39ea0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1368,8 +1368,10 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *id) MLX5_COREDEV_VF : MLX5_COREDEV_PF; dev->priv.adev_idx = mlx5_adev_idx_alloc(); - if (dev->priv.adev_idx < 0) - return dev->priv.adev_idx; + if (dev->priv.adev_idx < 0) { + err = dev->priv.adev_idx; + goto adev_init_err; + } err = mlx5_mdev_init(dev, prof_sel); if (err) @@ -1403,6 +1405,7 @@ pci_init_err: mlx5_mdev_uninit(dev); mdev_init_err: mlx5_adev_idx_free(dev->priv.adev_idx); +adev_init_err: mlx5_devlink_free(devlink); return err; From 7a6eb072a9548492ead086f3e820e9aac71c7138 Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Mon, 28 Dec 2020 16:48:40 +0800 Subject: [PATCH 475/547] net/mlx5e: Fix two double free cases mlx5e_create_ttc_table_groups() frees ft->g on failure of kvzalloc(), but such failure will be caught by its caller in mlx5e_create_ttc_table() and ft->g will be freed again in mlx5e_destroy_flow_table(). The same issue also occurs in mlx5e_create_ttc_table_groups(). Set ft->g to NULL after kfree() to avoid double free. Fixes: 7b3722fa9ef6 ("net/mlx5e: Support RSS for GRE tunneled packets") Fixes: 33cfaaa8f36f ("net/mlx5e: Split the main flow steering table") Signed-off-by: Dinghao Liu Reviewed-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_fs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c index fa8149f6eb08..44a2dfbc3853 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c @@ -942,6 +942,7 @@ static int mlx5e_create_ttc_table_groups(struct mlx5e_ttc_table *ttc, in = kvzalloc(inlen, GFP_KERNEL); if (!in) { kfree(ft->g); + ft->g = NULL; return -ENOMEM; } @@ -1087,6 +1088,7 @@ static int mlx5e_create_inner_ttc_table_groups(struct mlx5e_ttc_table *ttc) in = kvzalloc(inlen, GFP_KERNEL); if (!in) { kfree(ft->g); + ft->g = NULL; return -ENOMEM; } From 5b0bb12c58ac7d22e05b5bfdaa30a116c8c32e32 Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Mon, 21 Dec 2020 19:27:31 +0800 Subject: [PATCH 476/547] net/mlx5e: Fix memleak in mlx5e_create_l2_table_groups When mlx5_create_flow_group() fails, ft->g should be freed just like when kvzalloc() fails. The caller of mlx5e_create_l2_table_groups() does not catch this issue on failure, which leads to memleak. Fixes: 33cfaaa8f36f ("net/mlx5e: Split the main flow steering table") Signed-off-by: Dinghao Liu Reviewed-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_fs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c index 44a2dfbc3853..e02e5895703d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c @@ -1392,6 +1392,7 @@ err_destroy_groups: ft->g[ft->num_groups] = NULL; mlx5e_destroy_groups(ft); kvfree(in); + kfree(ft->g); return err; } From f3562f5e00bbae2a6b292941ec76a9140aa3b7dd Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Wed, 6 Jan 2021 17:17:35 +0100 Subject: [PATCH 477/547] docs: octeontx2: tune rst markup Commit 80b9414832a1 ("docs: octeontx2: Add Documentation for NPA health reporters") added new documentation with improper formatting for rst, and caused a few new warnings for make htmldocs in octeontx2.rst:169--202. Tune markup and formatting for better presentation in the HTML view. Signed-off-by: Lukas Bulwahn Acked-by: Randy Dunlap Acked-by: George Cherian Link: https://lore.kernel.org/r/20210106161735.21751-1-lukas.bulwahn@gmail.com Signed-off-by: Jakub Kicinski --- .../ethernet/marvell/octeontx2.rst | 62 +++++++++++-------- 1 file changed, 36 insertions(+), 26 deletions(-) diff --git a/Documentation/networking/device_drivers/ethernet/marvell/octeontx2.rst b/Documentation/networking/device_drivers/ethernet/marvell/octeontx2.rst index d3fcf536d14e..61e850460e18 100644 --- a/Documentation/networking/device_drivers/ethernet/marvell/octeontx2.rst +++ b/Documentation/networking/device_drivers/ethernet/marvell/octeontx2.rst @@ -164,46 +164,56 @@ Devlink health reporters NPA Reporters ------------- -The NPA reporters are responsible for reporting and recovering the following group of errors +The NPA reporters are responsible for reporting and recovering the following group of errors: + 1. GENERAL events + - Error due to operation of unmapped PF. - Error due to disabled alloc/free for other HW blocks (NIX, SSO, TIM, DPI and AURA). + 2. ERROR events + - Fault due to NPA_AQ_INST_S read or NPA_AQ_RES_S write. - AQ Doorbell Error. + 3. RAS events + - RAS Error Reporting for NPA_AQ_INST_S/NPA_AQ_RES_S. + 4. RVU events + - Error due to unmapped slot. -Sample Output -------------- -~# devlink health -pci/0002:01:00.0: - reporter hw_npa_intr - state healthy error 2872 recover 2872 last_dump_date 2020-12-10 last_dump_time 09:39:09 grace_period 0 auto_recover true auto_dump true - reporter hw_npa_gen - state healthy error 2872 recover 2872 last_dump_date 2020-12-11 last_dump_time 04:43:04 grace_period 0 auto_recover true auto_dump true - reporter hw_npa_err - state healthy error 2871 recover 2871 last_dump_date 2020-12-10 last_dump_time 09:39:17 grace_period 0 auto_recover true auto_dump true - reporter hw_npa_ras - state healthy error 0 recover 0 last_dump_date 2020-12-10 last_dump_time 09:32:40 grace_period 0 auto_recover true auto_dump true +Sample Output:: + + ~# devlink health + pci/0002:01:00.0: + reporter hw_npa_intr + state healthy error 2872 recover 2872 last_dump_date 2020-12-10 last_dump_time 09:39:09 grace_period 0 auto_recover true auto_dump true + reporter hw_npa_gen + state healthy error 2872 recover 2872 last_dump_date 2020-12-11 last_dump_time 04:43:04 grace_period 0 auto_recover true auto_dump true + reporter hw_npa_err + state healthy error 2871 recover 2871 last_dump_date 2020-12-10 last_dump_time 09:39:17 grace_period 0 auto_recover true auto_dump true + reporter hw_npa_ras + state healthy error 0 recover 0 last_dump_date 2020-12-10 last_dump_time 09:32:40 grace_period 0 auto_recover true auto_dump true Each reporter dumps the + - Error Type - Error Register value - Reason in words -For eg: -~# devlink health dump show pci/0002:01:00.0 reporter hw_npa_gen - NPA_AF_GENERAL: - NPA General Interrupt Reg : 1 - NIX0: free disabled RX -~# devlink health dump show pci/0002:01:00.0 reporter hw_npa_intr - NPA_AF_RVU: - NPA RVU Interrupt Reg : 1 - Unmap Slot Error -~# devlink health dump show pci/0002:01:00.0 reporter hw_npa_err - NPA_AF_ERR: - NPA Error Interrupt Reg : 4096 - AQ Doorbell Error +For example:: + + ~# devlink health dump show pci/0002:01:00.0 reporter hw_npa_gen + NPA_AF_GENERAL: + NPA General Interrupt Reg : 1 + NIX0: free disabled RX + ~# devlink health dump show pci/0002:01:00.0 reporter hw_npa_intr + NPA_AF_RVU: + NPA RVU Interrupt Reg : 1 + Unmap Slot Error + ~# devlink health dump show pci/0002:01:00.0 reporter hw_npa_err + NPA_AF_ERR: + NPA Error Interrupt Reg : 4096 + AQ Doorbell Error From 0ef597c3ac49a62e1a2c1c10f88dd76fde1e1636 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Tue, 5 Jan 2021 06:58:15 +0100 Subject: [PATCH 478/547] docs: remove mention of ENABLE_MUST_CHECK We removed ENABLE_MUST_CHECK in 196793946264 ("Compiler Attributes: remove CONFIG_ENABLE_MUST_CHECK"), so let's remove docs' mentions. At the same time, fix the outdated text related to ENABLE_WARN_DEPRECATED that wasn't removed in 3337d5cfe5e08 ("configs: get rid of obsolete CONFIG_ENABLE_WARN_DEPRECATED"). Finally, reflow the paragraph. Signed-off-by: Miguel Ojeda Reviewed-by: Nathan Chancellor Reviewed-by: Nick Desaulniers Link: https://lore.kernel.org/r/20210105055815.GA5173@kernel.org Signed-off-by: Jonathan Corbet --- Documentation/process/4.Coding.rst | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/Documentation/process/4.Coding.rst b/Documentation/process/4.Coding.rst index c27e59d2f702..0825dc496f22 100644 --- a/Documentation/process/4.Coding.rst +++ b/Documentation/process/4.Coding.rst @@ -249,10 +249,8 @@ features; most of these are found in the "kernel hacking" submenu. Several of these options should be turned on for any kernel used for development or testing purposes. In particular, you should turn on: - - ENABLE_MUST_CHECK and FRAME_WARN to get an - extra set of warnings for problems like the use of deprecated interfaces - or ignoring an important return value from a function. The output - generated by these warnings can be verbose, but one need not worry about + - FRAME_WARN to get warnings for stack frames larger than a given amount. + The output generated can be verbose, but one need not worry about warnings from other parts of the kernel. - DEBUG_OBJECTS will add code to track the lifetime of various objects From a734a7235ef3768dd3c9b7034f663ae6b260375f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonathan=20Neusch=C3=A4fer?= Date: Fri, 1 Jan 2021 22:14:47 +0100 Subject: [PATCH 479/547] docs: binfmt-misc: Fix .rst formatting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit "name below" is not part of the /proc path and should not be formatted in monospace. "doesn``t" is rendered in HTML with a double backtick. Revert it back to "doesn't". Signed-off-by: Jonathan Neuschäfer Link: https://lore.kernel.org/r/20210101211447.1021412-1-j.neuschaefer@gmx.net Signed-off-by: Jonathan Corbet --- Documentation/admin-guide/binfmt-misc.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/binfmt-misc.rst b/Documentation/admin-guide/binfmt-misc.rst index 7a864131e5ea..59cd902e3549 100644 --- a/Documentation/admin-guide/binfmt-misc.rst +++ b/Documentation/admin-guide/binfmt-misc.rst @@ -23,7 +23,7 @@ Here is what the fields mean: - ``name`` is an identifier string. A new /proc file will be created with this - ``name below /proc/sys/fs/binfmt_misc``; cannot contain slashes ``/`` for + name below ``/proc/sys/fs/binfmt_misc``; cannot contain slashes ``/`` for obvious reasons. - ``type`` is the type of recognition. Give ``M`` for magic and ``E`` for extension. @@ -83,7 +83,7 @@ Here is what the fields mean: ``F`` - fix binary The usual behaviour of binfmt_misc is to spawn the binary lazily when the misc format file is invoked. However, - this doesn``t work very well in the face of mount namespaces and + this doesn't work very well in the face of mount namespaces and changeroots, so the ``F`` mode opens the binary as soon as the emulation is installed and uses the opened image to spawn the emulator, meaning it is always available once installed, From 25942e5ecbac33918ec2f0869ca9a374dbb023f2 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 31 Dec 2020 20:08:31 -0800 Subject: [PATCH 480/547] Documentation/admin-guide: kernel-parameters: hyphenate comma-separated Hyphenate "comma separated" when it is used as a compound adjective. hyphenated. Signed-off-by: Randy Dunlap Link: https://lore.kernel.org/r/20210101040831.4148-1-rdunlap@infradead.org Signed-off-by: Jonathan Corbet --- Documentation/admin-guide/kernel-parameters.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index c722ec19cd00..9e3cdb271d06 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1385,7 +1385,7 @@ ftrace_filter=[function-list] [FTRACE] Limit the functions traced by the function - tracer at boot up. function-list is a comma separated + tracer at boot up. function-list is a comma-separated list of functions. This list can be changed at run time by the set_ftrace_filter file in the debugfs tracing directory. @@ -1399,13 +1399,13 @@ ftrace_graph_filter=[function-list] [FTRACE] Limit the top level callers functions traced by the function graph tracer at boot up. - function-list is a comma separated list of functions + function-list is a comma-separated list of functions that can be changed at run time by the set_graph_function file in the debugfs tracing directory. ftrace_graph_notrace=[function-list] [FTRACE] Do not trace from the functions specified in - function-list. This list is a comma separated list of + function-list. This list is a comma-separated list of functions that can be changed at run time by the set_graph_notrace file in the debugfs tracing directory. @@ -2421,7 +2421,7 @@ when set. Format: - libata.force= [LIBATA] Force configurations. The format is comma + libata.force= [LIBATA] Force configurations. The format is comma- separated list of "[ID:]VAL" where ID is PORT[.DEVICE]. PORT and DEVICE are decimal numbers matching port, link or device. Basically, it matches @@ -5145,7 +5145,7 @@ stacktrace_filter=[function-list] [FTRACE] Limit the functions that the stack tracer - will trace at boot up. function-list is a comma separated + will trace at boot up. function-list is a comma-separated list of functions. This list can be changed at run time by the stack_trace_filter file in the debugfs tracing directory. Note, this enables stack tracing @@ -5348,7 +5348,7 @@ trace_event=[event-list] [FTRACE] Set and start specified trace events in order to facilitate early boot debugging. The event-list is a - comma separated list of trace events to enable. See + comma-separated list of trace events to enable. See also Documentation/trace/events.rst trace_options=[option-list] From 9d54ee78aef62c29b15ae2f58a70b1d1cd63a8f0 Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Thu, 7 Jan 2021 18:26:10 +0530 Subject: [PATCH 481/547] docs: admin-guide: bootconfig: Fix feils to fails s/feils/fails/p Signed-off-by: Bhaskar Chowdhury Acked-by: Randy Dunlap Link: https://lore.kernel.org/r/20210107125610.1576368-1-unixbhaskar@gmail.com Signed-off-by: Jonathan Corbet --- Documentation/admin-guide/bootconfig.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/bootconfig.rst b/Documentation/admin-guide/bootconfig.rst index 9b90efcc3a35..452b7dcd7f6b 100644 --- a/Documentation/admin-guide/bootconfig.rst +++ b/Documentation/admin-guide/bootconfig.rst @@ -154,7 +154,7 @@ get the boot configuration data. Because of this "piggyback" method, there is no need to change or update the boot loader and the kernel image itself as long as the boot loader passes the correct initrd file size. If by any chance, the boot -loader passes a longer size, the kernel feils to find the bootconfig data. +loader passes a longer size, the kernel fails to find the bootconfig data. To do this operation, Linux kernel provides "bootconfig" command under tools/bootconfig, which allows admin to apply or delete the config file From bb12433bf56e76789c6b08b36c546f745a6aa6e1 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Wed, 6 Jan 2021 12:34:36 -0800 Subject: [PATCH 482/547] ARC: unbork 5.11 bootup: fix snafu in _TIF_NOTIFY_SIGNAL handling Linux 5.11.rcX was failing to boot on ARC HSDK board. Turns out we have a couple of issues, this being the first one, and I'm to blame as I didn't pay attention during review. TIF_NOTIFY_SIGNAL support requires checking multiple TIF_* bits in kernel return code path. Old code only needed to check a single bit so BBIT0 worked. New code needs to check multiple bits so AND instruction. So needs to use bit mask variant _TIF_SIGPENDING Cc: Jens Axboe Fixes: 53855e12588743ea128 ("arc: add support for TIF_NOTIFY_SIGNAL") Link: https://github.com/foss-for-synopsys-dwc-arc-processors/linux/issues/34 Signed-off-by: Vineet Gupta --- arch/arc/kernel/entry.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arc/kernel/entry.S b/arch/arc/kernel/entry.S index 1f5308abf36d..1743506081da 100644 --- a/arch/arc/kernel/entry.S +++ b/arch/arc/kernel/entry.S @@ -307,7 +307,7 @@ resume_user_mode_begin: mov r0, sp ; pt_regs for arg to do_signal()/do_notify_resume() GET_CURR_THR_INFO_FLAGS r9 - and.f 0, r9, TIF_SIGPENDING|TIF_NOTIFY_SIGNAL + and.f 0, r9, _TIF_SIGPENDING|_TIF_NOTIFY_SIGNAL bz .Lchk_notify_resume ; Normal Trap/IRQ entry only saves Scratch (caller-saved) regs From 9e7a67dee27902fedab880b9af909bd4acd0fba9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 Jan 2021 00:15:21 +0100 Subject: [PATCH 483/547] selftests: netfilter: add selftest for ipip pmtu discovery with enabled connection tracking Convert Christians bug description into a reproducer. Cc: Shuah Khan Reported-by: Christian Perle Signed-off-by: Florian Westphal Acked-by: Pablo Neira Ayuso Signed-off-by: Jakub Kicinski --- tools/testing/selftests/netfilter/Makefile | 3 +- .../selftests/netfilter/ipip-conntrack-mtu.sh | 206 ++++++++++++++++++ 2 files changed, 208 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/netfilter/ipip-conntrack-mtu.sh diff --git a/tools/testing/selftests/netfilter/Makefile b/tools/testing/selftests/netfilter/Makefile index a374e10ef506..3006a8e5b41a 100644 --- a/tools/testing/selftests/netfilter/Makefile +++ b/tools/testing/selftests/netfilter/Makefile @@ -4,7 +4,8 @@ TEST_PROGS := nft_trans_stress.sh nft_nat.sh bridge_brouter.sh \ conntrack_icmp_related.sh nft_flowtable.sh ipvs.sh \ nft_concat_range.sh nft_conntrack_helper.sh \ - nft_queue.sh nft_meta.sh + nft_queue.sh nft_meta.sh \ + ipip-conntrack-mtu.sh LDLIBS = -lmnl TEST_GEN_FILES = nf-queue diff --git a/tools/testing/selftests/netfilter/ipip-conntrack-mtu.sh b/tools/testing/selftests/netfilter/ipip-conntrack-mtu.sh new file mode 100755 index 000000000000..4a6f5c3b3215 --- /dev/null +++ b/tools/testing/selftests/netfilter/ipip-conntrack-mtu.sh @@ -0,0 +1,206 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +# Conntrack needs to reassemble fragments in order to have complete +# packets for rule matching. Reassembly can lead to packet loss. + +# Consider the following setup: +# +--------+ +---------+ +--------+ +# |Router A|-------|Wanrouter|-------|Router B| +# | |.IPIP..| |..IPIP.| | +# +--------+ +---------+ +--------+ +# / mtu 1400 \ +# / \ +#+--------+ +--------+ +#|Client A| |Client B| +#| | | | +#+--------+ +--------+ + +# Router A and Router B use IPIP tunnel interfaces to tunnel traffic +# between Client A and Client B over WAN. Wanrouter has MTU 1400 set +# on its interfaces. + +rnd=$(mktemp -u XXXXXXXX) +rx=$(mktemp) + +r_a="ns-ra-$rnd" +r_b="ns-rb-$rnd" +r_w="ns-rw-$rnd" +c_a="ns-ca-$rnd" +c_b="ns-cb-$rnd" + +checktool (){ + if ! $1 > /dev/null 2>&1; then + echo "SKIP: Could not $2" + exit $ksft_skip + fi +} + +checktool "iptables --version" "run test without iptables" +checktool "ip -Version" "run test without ip tool" +checktool "which nc" "run test without nc (netcat)" +checktool "ip netns add ${r_a}" "create net namespace" + +for n in ${r_b} ${r_w} ${c_a} ${c_b};do + ip netns add ${n} +done + +cleanup() { + for n in ${r_a} ${r_b} ${r_w} ${c_a} ${c_b};do + ip netns del ${n} + done + rm -f ${rx} +} + +trap cleanup EXIT + +test_path() { + msg="$1" + + ip netns exec ${c_b} nc -n -w 3 -q 3 -u -l -p 5000 > ${rx} < /dev/null & + + sleep 1 + for i in 1 2 3; do + head -c1400 /dev/zero | tr "\000" "a" | ip netns exec ${c_a} nc -n -w 1 -u 192.168.20.2 5000 + done + + wait + + bytes=$(wc -c < ${rx}) + + if [ $bytes -eq 1400 ];then + echo "OK: PMTU $msg connection tracking" + else + echo "FAIL: PMTU $msg connection tracking: got $bytes, expected 1400" + exit 1 + fi +} + +# Detailed setup for Router A +# --------------------------- +# Interfaces: +# eth0: 10.2.2.1/24 +# eth1: 192.168.10.1/24 +# ipip0: No IP address, local 10.2.2.1 remote 10.4.4.1 +# Routes: +# 192.168.20.0/24 dev ipip0 (192.168.20.0/24 is subnet of Client B) +# 10.4.4.1 via 10.2.2.254 (Router B via Wanrouter) +# No iptables rules at all. + +ip link add veth0 netns ${r_a} type veth peer name veth0 netns ${r_w} +ip link add veth1 netns ${r_a} type veth peer name veth0 netns ${c_a} + +l_addr="10.2.2.1" +r_addr="10.4.4.1" +ip netns exec ${r_a} ip link add ipip0 type ipip local ${l_addr} remote ${r_addr} mode ipip || exit $ksft_skip + +for dev in lo veth0 veth1 ipip0; do + ip -net ${r_a} link set $dev up +done + +ip -net ${r_a} addr add 10.2.2.1/24 dev veth0 +ip -net ${r_a} addr add 192.168.10.1/24 dev veth1 + +ip -net ${r_a} route add 192.168.20.0/24 dev ipip0 +ip -net ${r_a} route add 10.4.4.0/24 via 10.2.2.254 + +ip netns exec ${r_a} sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null + +# Detailed setup for Router B +# --------------------------- +# Interfaces: +# eth0: 10.4.4.1/24 +# eth1: 192.168.20.1/24 +# ipip0: No IP address, local 10.4.4.1 remote 10.2.2.1 +# Routes: +# 192.168.10.0/24 dev ipip0 (192.168.10.0/24 is subnet of Client A) +# 10.2.2.1 via 10.4.4.254 (Router A via Wanrouter) +# No iptables rules at all. + +ip link add veth0 netns ${r_b} type veth peer name veth1 netns ${r_w} +ip link add veth1 netns ${r_b} type veth peer name veth0 netns ${c_b} + +l_addr="10.4.4.1" +r_addr="10.2.2.1" + +ip netns exec ${r_b} ip link add ipip0 type ipip local ${l_addr} remote ${r_addr} mode ipip || exit $ksft_skip + +for dev in lo veth0 veth1 ipip0; do + ip -net ${r_b} link set $dev up +done + +ip -net ${r_b} addr add 10.4.4.1/24 dev veth0 +ip -net ${r_b} addr add 192.168.20.1/24 dev veth1 + +ip -net ${r_b} route add 192.168.10.0/24 dev ipip0 +ip -net ${r_b} route add 10.2.2.0/24 via 10.4.4.254 +ip netns exec ${r_b} sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null + +# Client A +ip -net ${c_a} addr add 192.168.10.2/24 dev veth0 +ip -net ${c_a} link set dev lo up +ip -net ${c_a} link set dev veth0 up +ip -net ${c_a} route add default via 192.168.10.1 + +# Client A +ip -net ${c_b} addr add 192.168.20.2/24 dev veth0 +ip -net ${c_b} link set dev veth0 up +ip -net ${c_b} link set dev lo up +ip -net ${c_b} route add default via 192.168.20.1 + +# Wan +ip -net ${r_w} addr add 10.2.2.254/24 dev veth0 +ip -net ${r_w} addr add 10.4.4.254/24 dev veth1 + +ip -net ${r_w} link set dev lo up +ip -net ${r_w} link set dev veth0 up mtu 1400 +ip -net ${r_w} link set dev veth1 up mtu 1400 + +ip -net ${r_a} link set dev veth0 mtu 1400 +ip -net ${r_b} link set dev veth0 mtu 1400 + +ip netns exec ${r_w} sysctl -q net.ipv4.conf.all.forwarding=1 > /dev/null + +# Path MTU discovery +# ------------------ +# Running tracepath from Client A to Client B shows PMTU discovery is working +# as expected: +# +# clienta:~# tracepath 192.168.20.2 +# 1?: [LOCALHOST] pmtu 1500 +# 1: 192.168.10.1 0.867ms +# 1: 192.168.10.1 0.302ms +# 2: 192.168.10.1 0.312ms pmtu 1480 +# 2: no reply +# 3: 192.168.10.1 0.510ms pmtu 1380 +# 3: 192.168.20.2 2.320ms reached +# Resume: pmtu 1380 hops 3 back 3 + +# ip netns exec ${c_a} traceroute --mtu 192.168.20.2 + +# Router A has learned PMTU (1400) to Router B from Wanrouter. +# Client A has learned PMTU (1400 - IPIP overhead = 1380) to Client B +# from Router A. + +#Send large UDP packet +#--------------------- +#Now we send a 1400 bytes UDP packet from Client A to Client B: + +# clienta:~# head -c1400 /dev/zero | tr "\000" "a" | nc -u 192.168.20.2 5000 +test_path "without" + +# The IPv4 stack on Client A already knows the PMTU to Client B, so the +# UDP packet is sent as two fragments (1380 + 20). Router A forwards the +# fragments between eth1 and ipip0. The fragments fit into the tunnel and +# reach their destination. + +#When sending the large UDP packet again, Router A now reassembles the +#fragments before routing the packet over ipip0. The resulting IPIP +#packet is too big (1400) for the tunnel PMTU (1380) to Router B, it is +#dropped on Router A before sending. + +ip netns exec ${r_a} iptables -A FORWARD -m conntrack --ctstate NEW +test_path "with" From 50c661670f6a3908c273503dfa206dfc7aa54c07 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 Jan 2021 00:15:22 +0100 Subject: [PATCH 484/547] net: fix pmtu check in nopmtudisc mode For some reason ip_tunnel insist on setting the DF bit anyway when the inner header has the DF bit set, EVEN if the tunnel was configured with 'nopmtudisc'. This means that the script added in the previous commit cannot be made to work by adding the 'nopmtudisc' flag to the ip tunnel configuration. Doing so breaks connectivity even for the without-conntrack/netfilter scenario. When nopmtudisc is set, the tunnel will skip the mtu check, so no icmp error is sent to client. Then, because inner header has DF set, the outer header gets added with DF bit set as well. IP stack then sends an error to itself because the packet exceeds the device MTU. Fixes: 23a3647bc4f93 ("ip_tunnels: Use skb-len to PMTU check.") Cc: Stefano Brivio Signed-off-by: Florian Westphal Acked-by: Pablo Neira Ayuso Signed-off-by: Jakub Kicinski --- net/ipv4/ip_tunnel.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index ee65c9225178..64594aa755f0 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -759,8 +759,11 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error; } - if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph, - 0, 0, false)) { + df = tnl_params->frag_off; + if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df) + df |= (inner_iph->frag_off & htons(IP_DF)); + + if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) { ip_rt_put(rt); goto tx_error; } @@ -788,10 +791,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, ttl = ip4_dst_hoplimit(&rt->dst); } - df = tnl_params->frag_off; - if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df) - df |= (inner_iph->frag_off&htons(IP_DF)); - max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr) + rt->dst.header_len + ip_encap_hlen(&tunnel->encap); if (max_headroom > dev->needed_headroom) From bb4cc1a18856a73f0ff5137df0c2a31f4c50f6cf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 6 Jan 2021 00:15:23 +0100 Subject: [PATCH 485/547] net: ip: always refragment ip defragmented packets Conntrack reassembly records the largest fragment size seen in IPCB. However, when this gets forwarded/transmitted, fragmentation will only be forced if one of the fragmented packets had the DF bit set. In that case, a flag in IPCB will force fragmentation even if the MTU is large enough. This should work fine, but this breaks with ip tunnels. Consider client that sends a UDP datagram of size X to another host. The client fragments the datagram, so two packets, of size y and z, are sent. DF bit is not set on any of these packets. Middlebox netfilter reassembles those packets back to single size-X packet, before routing decision. packet-size-vs-mtu checks in ip_forward are irrelevant, because DF bit isn't set. At output time, ip refragmentation is skipped as well because x is still smaller than the mtu of the output device. If ttransmit device is an ip tunnel, the packet size increases to x+overhead. Also, tunnel might be configured to force DF bit on outer header. In this case, packet will be dropped (exceeds MTU) and an ICMP error is generated back to sender. But sender already respects the announced MTU, all the packets that it sent did fit the announced mtu. Force refragmentation as per original sizes unconditionally so ip tunnel will encapsulate the fragments instead. The only other solution I see is to place ip refragmentation in the ip_tunnel code to handle this case. Fixes: d6b915e29f4ad ("ip_fragment: don't forward defragmented DF packet") Reported-by: Christian Perle Signed-off-by: Florian Westphal Acked-by: Pablo Neira Ayuso Signed-off-by: Jakub Kicinski --- net/ipv4/ip_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 89fff5f59eea..2ed0b01f72f0 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -302,7 +302,7 @@ static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff * if (skb_is_gso(skb)) return ip_finish_output_gso(net, sk, skb, mtu); - if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU)) + if (skb->len > mtu || IPCB(skb)->frag_max_size) return ip_fragment(net, sk, skb, mtu, ip_finish_output2); return ip_finish_output2(net, sk, skb); From 2aa078932ff6c66bf10cc5b3144440dbfa7d813d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 17 Dec 2020 16:31:36 -0800 Subject: [PATCH 486/547] KVM: x86/mmu: Use -1 to flag an undefined spte in get_mmio_spte() Return -1 from the get_walk() helpers if the shadow walk doesn't fill at least one spte, which can theoretically happen if the walk hits a not-present PDPTR. Returning the root level in such a case will cause get_mmio_spte() to return garbage (uninitialized stack data). In practice, such a scenario should be impossible as KVM shouldn't get a reserved-bit page fault with a not-present PDPTR. Note, using mmu->root_level in get_walk() is wrong for other reasons, too, but that's now a moot point. Fixes: 95fb5b0258b7 ("kvm: x86/mmu: Support MMIO in the TDP MMU") Cc: Ben Gardon Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20201218003139.2167891-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 7 ++++++- arch/x86/kvm/mmu/tdp_mmu.c | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 7a6ae9e90bd7..a48cd12c01d7 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3488,7 +3488,7 @@ static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes) { struct kvm_shadow_walk_iterator iterator; - int leaf = vcpu->arch.mmu->root_level; + int leaf = -1; u64 spte; @@ -3532,6 +3532,11 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) else leaf = get_walk(vcpu, addr, sptes); + if (unlikely(leaf < 0)) { + *sptep = 0ull; + return reserved; + } + rsvd_check = &vcpu->arch.mmu->shadow_zero_check; for (level = root; level >= leaf; level--) { diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 84c8f06bec26..50cec7a15ddb 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1152,8 +1152,8 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes) { struct tdp_iter iter; struct kvm_mmu *mmu = vcpu->arch.mmu; - int leaf = vcpu->arch.mmu->shadow_root_level; gfn_t gfn = addr >> PAGE_SHIFT; + int leaf = -1; tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { leaf = iter.level; From 39b4d43e6003cee51cd119596d3c33d0449eb44c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 17 Dec 2020 16:31:37 -0800 Subject: [PATCH 487/547] KVM: x86/mmu: Get root level from walkers when retrieving MMIO SPTE Get the so called "root" level from the low level shadow page table walkers instead of manually attempting to calculate it higher up the stack, e.g. in get_mmio_spte(). When KVM is using PAE shadow paging, the starting level of the walk, from the callers perspective, is not the CR3 root but rather the PDPTR "root". Checking for reserved bits from the CR3 root causes get_mmio_spte() to consume uninitialized stack data due to indexing into sptes[] for a level that was not filled by get_walk(). This can result in false positives and/or negatives depending on what garbage happens to be on the stack. Opportunistically nuke a few extra newlines. Fixes: 95fb5b0258b7 ("kvm: x86/mmu: Support MMIO in the TDP MMU") Reported-by: Richard Herbert Cc: Ben Gardon Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20201218003139.2167891-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 15 ++++++--------- arch/x86/kvm/mmu/tdp_mmu.c | 5 ++++- arch/x86/kvm/mmu/tdp_mmu.h | 4 +++- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index a48cd12c01d7..52f36c879086 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3485,16 +3485,16 @@ static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) * Return the level of the lowest level SPTE added to sptes. * That SPTE may be non-present. */ -static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes) +static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level) { struct kvm_shadow_walk_iterator iterator; int leaf = -1; u64 spte; - walk_shadow_page_lockless_begin(vcpu); - for (shadow_walk_init(&iterator, vcpu, addr); + for (shadow_walk_init(&iterator, vcpu, addr), + *root_level = iterator.level; shadow_walk_okay(&iterator); __shadow_walk_next(&iterator, spte)) { leaf = iterator.level; @@ -3504,7 +3504,6 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes) if (!is_shadow_present_pte(spte)) break; - } walk_shadow_page_lockless_end(vcpu); @@ -3517,9 +3516,7 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) { u64 sptes[PT64_ROOT_MAX_LEVEL]; struct rsvd_bits_validate *rsvd_check; - int root = vcpu->arch.mmu->shadow_root_level; - int leaf; - int level; + int root, leaf, level; bool reserved = false; if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) { @@ -3528,9 +3525,9 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) } if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)) - leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes); + leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root); else - leaf = get_walk(vcpu, addr, sptes); + leaf = get_walk(vcpu, addr, sptes, &root); if (unlikely(leaf < 0)) { *sptep = 0ull; diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 50cec7a15ddb..a4f9447f8327 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1148,13 +1148,16 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, * Return the level of the lowest level SPTE added to sptes. * That SPTE may be non-present. */ -int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes) +int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, + int *root_level) { struct tdp_iter iter; struct kvm_mmu *mmu = vcpu->arch.mmu; gfn_t gfn = addr >> PAGE_SHIFT; int leaf = -1; + *root_level = vcpu->arch.mmu->shadow_root_level; + tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { leaf = iter.level; sptes[leaf - 1] = iter.old_spte; diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 556e065503f6..cbbdbadd1526 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -44,5 +44,7 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn); -int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes); +int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, + int *root_level); + #endif /* __KVM_X86_MMU_TDP_MMU_H */ From dde81f9477d018a96fba991c5928c6ab8cc109f8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 17 Dec 2020 16:31:38 -0800 Subject: [PATCH 488/547] KVM: x86/mmu: Use raw level to index into MMIO walks' sptes array Bump the size of the sptes array by one and use the raw level of the SPTE to index into the sptes array. Using the SPTE level directly improves readability by eliminating the need to reason out why the level is being adjusted when indexing the array. The array is on the stack and is not explicitly initialized; bumping its size is nothing more than a superficial adjustment to the stack frame. Signed-off-by: Sean Christopherson Message-Id: <20201218003139.2167891-4-seanjc@google.com> Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 15 +++++++-------- arch/x86/kvm/mmu/tdp_mmu.c | 2 +- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 52f36c879086..4798a4472066 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3500,7 +3500,7 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level leaf = iterator.level; spte = mmu_spte_get_lockless(iterator.sptep); - sptes[leaf - 1] = spte; + sptes[leaf] = spte; if (!is_shadow_present_pte(spte)) break; @@ -3514,7 +3514,7 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level /* return true if reserved bit is detected on spte. */ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) { - u64 sptes[PT64_ROOT_MAX_LEVEL]; + u64 sptes[PT64_ROOT_MAX_LEVEL + 1]; struct rsvd_bits_validate *rsvd_check; int root, leaf, level; bool reserved = false; @@ -3537,16 +3537,15 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) rsvd_check = &vcpu->arch.mmu->shadow_zero_check; for (level = root; level >= leaf; level--) { - if (!is_shadow_present_pte(sptes[level - 1])) + if (!is_shadow_present_pte(sptes[level])) break; /* * Use a bitwise-OR instead of a logical-OR to aggregate the * reserved bit and EPT's invalid memtype/XWR checks to avoid * adding a Jcc in the loop. */ - reserved |= __is_bad_mt_xwr(rsvd_check, sptes[level - 1]) | - __is_rsvd_bits_set(rsvd_check, sptes[level - 1], - level); + reserved |= __is_bad_mt_xwr(rsvd_check, sptes[level]) | + __is_rsvd_bits_set(rsvd_check, sptes[level], level); } if (reserved) { @@ -3554,10 +3553,10 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) __func__, addr); for (level = root; level >= leaf; level--) pr_err("------ spte 0x%llx level %d.\n", - sptes[level - 1], level); + sptes[level], level); } - *sptep = sptes[leaf - 1]; + *sptep = sptes[leaf]; return reserved; } diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index a4f9447f8327..efef571806ad 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1160,7 +1160,7 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { leaf = iter.level; - sptes[leaf - 1] = iter.old_spte; + sptes[leaf] = iter.old_spte; } return leaf; From 9aa418792f5f11ef5d6f72265e1f8ae07efd5784 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 17 Dec 2020 16:31:39 -0800 Subject: [PATCH 489/547] KVM: x86/mmu: Optimize not-present/MMIO SPTE check in get_mmio_spte() Check only the terminal leaf for a "!PRESENT || MMIO" SPTE when looking for reserved bits on valid, non-MMIO SPTEs. The get_walk() helpers terminate their walks if a not-present or MMIO SPTE is encountered, i.e. the non-terminal SPTEs have already been verified to be regular SPTEs. This eliminates an extra check-and-branch in a relatively hot loop. Signed-off-by: Sean Christopherson Message-Id: <20201218003139.2167891-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 4798a4472066..769855f5f0a1 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3511,7 +3511,7 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level return leaf; } -/* return true if reserved bit is detected on spte. */ +/* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) { u64 sptes[PT64_ROOT_MAX_LEVEL + 1]; @@ -3534,11 +3534,20 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) return reserved; } + *sptep = sptes[leaf]; + + /* + * Skip reserved bits checks on the terminal leaf if it's not a valid + * SPTE. Note, this also (intentionally) skips MMIO SPTEs, which, by + * design, always have reserved bits set. The purpose of the checks is + * to detect reserved bits on non-MMIO SPTEs. i.e. buggy SPTEs. + */ + if (!is_shadow_present_pte(sptes[leaf])) + leaf++; + rsvd_check = &vcpu->arch.mmu->shadow_zero_check; - for (level = root; level >= leaf; level--) { - if (!is_shadow_present_pte(sptes[level])) - break; + for (level = root; level >= leaf; level--) /* * Use a bitwise-OR instead of a logical-OR to aggregate the * reserved bit and EPT's invalid memtype/XWR checks to avoid @@ -3546,7 +3555,6 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) */ reserved |= __is_bad_mt_xwr(rsvd_check, sptes[level]) | __is_rsvd_bits_set(rsvd_check, sptes[level], level); - } if (reserved) { pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n", @@ -3556,8 +3564,6 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) sptes[level], level); } - *sptep = sptes[leaf]; - return reserved; } From f65cf84ee769767536dc367acc9568ddb6e4c9f4 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Fri, 18 Dec 2020 23:37:11 -0700 Subject: [PATCH 490/547] KVM: SVM: Add register operand to vmsave call in sev_es_vcpu_load When using LLVM's integrated assembler (LLVM_IAS=1) while building x86_64_defconfig + CONFIG_KVM=y + CONFIG_KVM_AMD=y, the following build error occurs: $ make LLVM=1 LLVM_IAS=1 arch/x86/kvm/svm/sev.o arch/x86/kvm/svm/sev.c:2004:15: error: too few operands for instruction asm volatile(__ex("vmsave") : : "a" (__sme_page_pa(sd->save_area)) : "memory"); ^ arch/x86/kvm/svm/sev.c:28:17: note: expanded from macro '__ex' #define __ex(x) __kvm_handle_fault_on_reboot(x) ^ ./arch/x86/include/asm/kvm_host.h:1646:10: note: expanded from macro '__kvm_handle_fault_on_reboot' "666: \n\t" \ ^ :2:2: note: instantiated into assembly here vmsave ^ 1 error generated. This happens because LLVM currently does not support calling vmsave without the fixed register operand (%rax for 64-bit and %eax for 32-bit). This will be fixed in LLVM 12 but the kernel currently supports LLVM 10.0.1 and newer so this needs to be handled. Add the proper register using the _ASM_AX macro, which matches the vmsave call in vmenter.S. Fixes: 861377730aa9 ("KVM: SVM: Provide support for SEV-ES vCPU loading") Link: https://reviews.llvm.org/D93524 Link: https://github.com/ClangBuiltLinux/linux/issues/1216 Signed-off-by: Nathan Chancellor Message-Id: <20201219063711.3526947-1-natechancellor@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 9858d5ae9ddd..563ced07b0b8 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2001,7 +2001,7 @@ void sev_es_vcpu_load(struct vcpu_svm *svm, int cpu) * of which one step is to perform a VMLOAD. Since hardware does not * perform a VMSAVE on VMRUN, the host savearea must be updated. */ - asm volatile(__ex("vmsave") : : "a" (__sme_page_pa(sd->save_area)) : "memory"); + asm volatile(__ex("vmsave %0") : : "a" (__sme_page_pa(sd->save_area)) : "memory"); /* * Certain MSRs are restored on VMEXIT, only save ones that aren't From 52782d5b63725a6c4bf642557c83507430064110 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Sun, 20 Dec 2020 21:03:39 +0100 Subject: [PATCH 491/547] KVM/SVM: Remove leftover __svm_vcpu_run prototype from svm.c Commit 16809ecdc1e8a moved __svm_vcpu_run the prototype to svm.h, but forgot to remove the original from svm.c. Fixes: 16809ecdc1e8a ("KVM: SVM: Provide an updated VMRUN invocation for SEV-ES guests") Cc: Tom Lendacky Cc: Paolo Bonzini Signed-off-by: Uros Bizjak Message-Id: <20201220200339.65115-1-ubizjak@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index cce0143a6f80..6824d611dc5d 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3677,8 +3677,6 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) return EXIT_FASTPATH_NONE; } -void __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs); - static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, struct vcpu_svm *svm) { From e42ac777d661e878c3b9bac56df11e226cab3010 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Fri, 18 Dec 2020 15:17:32 +0100 Subject: [PATCH 492/547] KVM: selftests: Factor out guest mode code demand_paging_test, dirty_log_test, and dirty_log_perf_test have redundant guest mode code. Factor it out. Also, while adding a new include, remove the ones we don't need. Reviewed-by: Ben Gardon Reviewed-by: Peter Xu Signed-off-by: Andrew Jones Message-Id: <20201218141734.54359-2-drjones@redhat.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/Makefile | 2 +- .../selftests/kvm/demand_paging_test.c | 105 ++++----------- .../selftests/kvm/dirty_log_perf_test.c | 121 +++++------------ tools/testing/selftests/kvm/dirty_log_test.c | 125 ++++++------------ .../selftests/kvm/include/guest_modes.h | 21 +++ tools/testing/selftests/kvm/lib/guest_modes.c | 70 ++++++++++ 6 files changed, 188 insertions(+), 256 deletions(-) create mode 100644 tools/testing/selftests/kvm/include/guest_modes.h create mode 100644 tools/testing/selftests/kvm/lib/guest_modes.c diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index c7ca4faba272..a7286a08c3ae 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -33,7 +33,7 @@ ifeq ($(ARCH),s390) UNAME_M := s390x endif -LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c lib/test_util.c +LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c lib/test_util.c lib/guest_modes.c LIBKVM_x86_64 = lib/x86_64/processor.c lib/x86_64/vmx.c lib/x86_64/svm.c lib/x86_64/ucall.c lib/x86_64/handlers.S LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c lib/s390x/diag318_test_handler.c diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index 3d96a7bfaff3..946161a9ce2d 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -7,23 +7,20 @@ * Copyright (C) 2019, Google, Inc. */ -#define _GNU_SOURCE /* for program_invocation_name */ +#define _GNU_SOURCE /* for program_invocation_name and pipe2 */ #include #include -#include -#include -#include #include #include #include -#include -#include #include +#include -#include "perf_test_util.h" -#include "processor.h" +#include "kvm_util.h" #include "test_util.h" +#include "perf_test_util.h" +#include "guest_modes.h" #ifdef __NR_userfaultfd @@ -248,9 +245,14 @@ static int setup_demand_paging(struct kvm_vm *vm, return 0; } -static void run_test(enum vm_guest_mode mode, bool use_uffd, - useconds_t uffd_delay) +struct test_params { + bool use_uffd; + useconds_t uffd_delay; +}; + +static void run_test(enum vm_guest_mode mode, void *arg) { + struct test_params *p = arg; pthread_t *vcpu_threads; pthread_t *uffd_handler_threads = NULL; struct uffd_handler_args *uffd_args = NULL; @@ -275,7 +277,7 @@ static void run_test(enum vm_guest_mode mode, bool use_uffd, add_vcpus(vm, nr_vcpus, guest_percpu_mem_size); - if (use_uffd) { + if (p->use_uffd) { uffd_handler_threads = malloc(nr_vcpus * sizeof(*uffd_handler_threads)); TEST_ASSERT(uffd_handler_threads, "Memory allocation failed"); @@ -308,7 +310,7 @@ static void run_test(enum vm_guest_mode mode, bool use_uffd, r = setup_demand_paging(vm, &uffd_handler_threads[vcpu_id], pipefds[vcpu_id * 2], - uffd_delay, &uffd_args[vcpu_id], + p->uffd_delay, &uffd_args[vcpu_id], vcpu_hva, guest_percpu_mem_size); if (r < 0) exit(-r); @@ -339,7 +341,7 @@ static void run_test(enum vm_guest_mode mode, bool use_uffd, pr_info("All vCPU threads joined\n"); - if (use_uffd) { + if (p->use_uffd) { char c; /* Tell the user fault fd handler threads to quit */ @@ -362,38 +364,19 @@ static void run_test(enum vm_guest_mode mode, bool use_uffd, free(guest_data_prototype); free(vcpu_threads); - if (use_uffd) { + if (p->use_uffd) { free(uffd_handler_threads); free(uffd_args); free(pipefds); } } -struct guest_mode { - bool supported; - bool enabled; -}; -static struct guest_mode guest_modes[NUM_VM_MODES]; - -#define guest_mode_init(mode, supported, enabled) ({ \ - guest_modes[mode] = (struct guest_mode){ supported, enabled }; \ -}) - static void help(char *name) { - int i; - puts(""); printf("usage: %s [-h] [-m mode] [-u] [-d uffd_delay_usec]\n" " [-b memory] [-v vcpus]\n", name); - printf(" -m: specify the guest mode ID to test\n" - " (default: test all supported modes)\n" - " This option may be used multiple times.\n" - " Guest mode IDs:\n"); - for (i = 0; i < NUM_VM_MODES; ++i) { - printf(" %d: %s%s\n", i, vm_guest_mode_string(i), - guest_modes[i].supported ? " (supported)" : ""); - } + guest_modes_help(); printf(" -u: use User Fault FD to handle vCPU page\n" " faults.\n"); printf(" -d: add a delay in usec to the User Fault\n" @@ -410,53 +393,22 @@ static void help(char *name) int main(int argc, char *argv[]) { int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); - bool mode_selected = false; - unsigned int mode; - int opt, i; - bool use_uffd = false; - useconds_t uffd_delay = 0; + struct test_params p = {}; + int opt; -#ifdef __x86_64__ - guest_mode_init(VM_MODE_PXXV48_4K, true, true); -#endif -#ifdef __aarch64__ - guest_mode_init(VM_MODE_P40V48_4K, true, true); - guest_mode_init(VM_MODE_P40V48_64K, true, true); - { - unsigned int limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE); - - if (limit >= 52) - guest_mode_init(VM_MODE_P52V48_64K, true, true); - if (limit >= 48) { - guest_mode_init(VM_MODE_P48V48_4K, true, true); - guest_mode_init(VM_MODE_P48V48_64K, true, true); - } - } -#endif -#ifdef __s390x__ - guest_mode_init(VM_MODE_P40V48_4K, true, true); -#endif + guest_modes_append_default(); while ((opt = getopt(argc, argv, "hm:ud:b:v:")) != -1) { switch (opt) { case 'm': - if (!mode_selected) { - for (i = 0; i < NUM_VM_MODES; ++i) - guest_modes[i].enabled = false; - mode_selected = true; - } - mode = strtoul(optarg, NULL, 10); - TEST_ASSERT(mode < NUM_VM_MODES, - "Guest mode ID %d too big", mode); - guest_modes[mode].enabled = true; + guest_modes_cmdline(optarg); break; case 'u': - use_uffd = true; + p.use_uffd = true; break; case 'd': - uffd_delay = strtoul(optarg, NULL, 0); - TEST_ASSERT(uffd_delay >= 0, - "A negative UFFD delay is not supported."); + p.uffd_delay = strtoul(optarg, NULL, 0); + TEST_ASSERT(p.uffd_delay >= 0, "A negative UFFD delay is not supported."); break; case 'b': guest_percpu_mem_size = parse_size(optarg); @@ -473,14 +425,7 @@ int main(int argc, char *argv[]) } } - for (i = 0; i < NUM_VM_MODES; ++i) { - if (!guest_modes[i].enabled) - continue; - TEST_ASSERT(guest_modes[i].supported, - "Guest mode ID %d (%s) not supported.", - i, vm_guest_mode_string(i)); - run_test(i, use_uffd, uffd_delay); - } + for_each_guest_mode(run_test, &p); return 0; } diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index 9c6a7be31e03..506741eb5d7f 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -12,16 +12,14 @@ #include #include -#include #include #include #include -#include #include "kvm_util.h" -#include "perf_test_util.h" -#include "processor.h" #include "test_util.h" +#include "perf_test_util.h" +#include "guest_modes.h" /* How many host loops to run by default (one KVM_GET_DIRTY_LOG for each loop)*/ #define TEST_HOST_LOOP_N 2UL @@ -89,9 +87,15 @@ static void *vcpu_worker(void *data) return NULL; } -static void run_test(enum vm_guest_mode mode, unsigned long iterations, - uint64_t phys_offset, int wr_fract) +struct test_params { + unsigned long iterations; + uint64_t phys_offset; + int wr_fract; +}; + +static void run_test(enum vm_guest_mode mode, void *arg) { + struct test_params *p = arg; pthread_t *vcpu_threads; struct kvm_vm *vm; unsigned long *bmap; @@ -108,7 +112,7 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, vm = create_vm(mode, nr_vcpus, guest_percpu_mem_size); - perf_test_args.wr_fract = wr_fract; + perf_test_args.wr_fract = p->wr_fract; guest_num_pages = (nr_vcpus * guest_percpu_mem_size) >> vm_get_page_shift(vm); guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages); @@ -156,7 +160,7 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, pr_info("Enabling dirty logging time: %ld.%.9lds\n\n", ts_diff.tv_sec, ts_diff.tv_nsec); - while (iteration < iterations) { + while (iteration < p->iterations) { /* * Incrementing the iteration number will start the vCPUs * dirtying memory again. @@ -210,15 +214,15 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, pr_info("Disabling dirty logging time: %ld.%.9lds\n", ts_diff.tv_sec, ts_diff.tv_nsec); - avg = timespec_div(get_dirty_log_total, iterations); + avg = timespec_div(get_dirty_log_total, p->iterations); pr_info("Get dirty log over %lu iterations took %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n", - iterations, get_dirty_log_total.tv_sec, + p->iterations, get_dirty_log_total.tv_sec, get_dirty_log_total.tv_nsec, avg.tv_sec, avg.tv_nsec); if (dirty_log_manual_caps) { - avg = timespec_div(clear_dirty_log_total, iterations); + avg = timespec_div(clear_dirty_log_total, p->iterations); pr_info("Clear dirty log over %lu iterations took %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n", - iterations, clear_dirty_log_total.tv_sec, + p->iterations, clear_dirty_log_total.tv_sec, clear_dirty_log_total.tv_nsec, avg.tv_sec, avg.tv_nsec); } @@ -228,20 +232,8 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, kvm_vm_free(vm); } -struct guest_mode { - bool supported; - bool enabled; -}; -static struct guest_mode guest_modes[NUM_VM_MODES]; - -#define guest_mode_init(mode, supported, enabled) ({ \ - guest_modes[mode] = (struct guest_mode){ supported, enabled }; \ -}) - static void help(char *name) { - int i; - puts(""); printf("usage: %s [-h] [-i iterations] [-p offset] " "[-m mode] [-b vcpu bytes] [-v vcpus]\n", name); @@ -250,14 +242,7 @@ static void help(char *name) TEST_HOST_LOOP_N); printf(" -p: specify guest physical test memory offset\n" " Warning: a low offset can conflict with the loaded test code.\n"); - printf(" -m: specify the guest mode ID to test " - "(default: test all supported modes)\n" - " This option may be used multiple times.\n" - " Guest mode IDs:\n"); - for (i = 0; i < NUM_VM_MODES; ++i) { - printf(" %d: %s%s\n", i, vm_guest_mode_string(i), - guest_modes[i].supported ? " (supported)" : ""); - } + guest_modes_help(); printf(" -b: specify the size of the memory region which should be\n" " dirtied by each vCPU. e.g. 10M or 3G.\n" " (default: 1G)\n"); @@ -272,74 +257,43 @@ static void help(char *name) int main(int argc, char *argv[]) { - unsigned long iterations = TEST_HOST_LOOP_N; - bool mode_selected = false; - uint64_t phys_offset = 0; - unsigned int mode; - int opt, i; - int wr_fract = 1; + int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); + struct test_params p = { + .iterations = TEST_HOST_LOOP_N, + .wr_fract = 1, + }; + int opt; dirty_log_manual_caps = kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | KVM_DIRTY_LOG_INITIALLY_SET); -#ifdef __x86_64__ - guest_mode_init(VM_MODE_PXXV48_4K, true, true); -#endif -#ifdef __aarch64__ - guest_mode_init(VM_MODE_P40V48_4K, true, true); - guest_mode_init(VM_MODE_P40V48_64K, true, true); - - { - unsigned int limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE); - - if (limit >= 52) - guest_mode_init(VM_MODE_P52V48_64K, true, true); - if (limit >= 48) { - guest_mode_init(VM_MODE_P48V48_4K, true, true); - guest_mode_init(VM_MODE_P48V48_64K, true, true); - } - } -#endif -#ifdef __s390x__ - guest_mode_init(VM_MODE_P40V48_4K, true, true); -#endif + guest_modes_append_default(); while ((opt = getopt(argc, argv, "hi:p:m:b:f:v:")) != -1) { switch (opt) { case 'i': - iterations = strtol(optarg, NULL, 10); + p.iterations = strtol(optarg, NULL, 10); break; case 'p': - phys_offset = strtoull(optarg, NULL, 0); + p.phys_offset = strtoull(optarg, NULL, 0); break; case 'm': - if (!mode_selected) { - for (i = 0; i < NUM_VM_MODES; ++i) - guest_modes[i].enabled = false; - mode_selected = true; - } - mode = strtoul(optarg, NULL, 10); - TEST_ASSERT(mode < NUM_VM_MODES, - "Guest mode ID %d too big", mode); - guest_modes[mode].enabled = true; + guest_modes_cmdline(optarg); break; case 'b': guest_percpu_mem_size = parse_size(optarg); break; case 'f': - wr_fract = atoi(optarg); - TEST_ASSERT(wr_fract >= 1, + p.wr_fract = atoi(optarg); + TEST_ASSERT(p.wr_fract >= 1, "Write fraction cannot be less than one"); break; case 'v': nr_vcpus = atoi(optarg); - TEST_ASSERT(nr_vcpus > 0, - "Must have a positive number of vCPUs"); - TEST_ASSERT(nr_vcpus <= MAX_VCPUS, - "This test does not currently support\n" - "more than %d vCPUs.", MAX_VCPUS); + TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus, + "Invalid number of vcpus, must be between 1 and %d", max_vcpus); break; case 'h': default: @@ -348,18 +302,11 @@ int main(int argc, char *argv[]) } } - TEST_ASSERT(iterations >= 2, "The test should have at least two iterations"); + TEST_ASSERT(p.iterations >= 2, "The test should have at least two iterations"); - pr_info("Test iterations: %"PRIu64"\n", iterations); + pr_info("Test iterations: %"PRIu64"\n", p.iterations); - for (i = 0; i < NUM_VM_MODES; ++i) { - if (!guest_modes[i].enabled) - continue; - TEST_ASSERT(guest_modes[i].supported, - "Guest mode ID %d (%s) not supported.", - i, vm_guest_mode_string(i)); - run_test(i, iterations, phys_offset, wr_fract); - } + for_each_guest_mode(run_test, &p); return 0; } diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index 471baecb7772..bb2752d78fe3 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -9,8 +9,6 @@ #include #include -#include -#include #include #include #include @@ -20,8 +18,9 @@ #include #include -#include "test_util.h" #include "kvm_util.h" +#include "test_util.h" +#include "guest_modes.h" #include "processor.h" #define VCPU_ID 1 @@ -673,9 +672,15 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid, #define DIRTY_MEM_BITS 30 /* 1G */ #define PAGE_SHIFT_4K 12 -static void run_test(enum vm_guest_mode mode, unsigned long iterations, - unsigned long interval, uint64_t phys_offset) +struct test_params { + unsigned long iterations; + unsigned long interval; + uint64_t phys_offset; +}; + +static void run_test(enum vm_guest_mode mode, void *arg) { + struct test_params *p = arg; struct kvm_vm *vm; unsigned long *bmap; @@ -709,12 +714,12 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, host_page_size = getpagesize(); host_num_pages = vm_num_host_pages(mode, guest_num_pages); - if (!phys_offset) { + if (!p->phys_offset) { guest_test_phys_mem = (vm_get_max_gfn(vm) - guest_num_pages) * guest_page_size; guest_test_phys_mem &= ~(host_page_size - 1); } else { - guest_test_phys_mem = phys_offset; + guest_test_phys_mem = p->phys_offset; } #ifdef __s390x__ @@ -758,9 +763,9 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, pthread_create(&vcpu_thread, NULL, vcpu_worker, vm); - while (iteration < iterations) { + while (iteration < p->iterations) { /* Give the vcpu thread some time to dirty some pages */ - usleep(interval * 1000); + usleep(p->interval * 1000); log_mode_collect_dirty_pages(vm, TEST_MEM_SLOT_INDEX, bmap, host_num_pages); vm_dirty_log_verify(mode, bmap); @@ -783,20 +788,8 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations, kvm_vm_free(vm); } -struct guest_mode { - bool supported; - bool enabled; -}; -static struct guest_mode guest_modes[NUM_VM_MODES]; - -#define guest_mode_init(mode, supported, enabled) ({ \ - guest_modes[mode] = (struct guest_mode){ supported, enabled }; \ -}) - static void help(char *name) { - int i; - puts(""); printf("usage: %s [-h] [-i iterations] [-I interval] " "[-p offset] [-m mode]\n", name); @@ -813,51 +806,23 @@ static void help(char *name) printf(" -M: specify the host logging mode " "(default: run all log modes). Supported modes: \n\t"); log_modes_dump(); - printf(" -m: specify the guest mode ID to test " - "(default: test all supported modes)\n" - " This option may be used multiple times.\n" - " Guest mode IDs:\n"); - for (i = 0; i < NUM_VM_MODES; ++i) { - printf(" %d: %s%s\n", i, vm_guest_mode_string(i), - guest_modes[i].supported ? " (supported)" : ""); - } + guest_modes_help(); puts(""); exit(0); } int main(int argc, char *argv[]) { - unsigned long iterations = TEST_HOST_LOOP_N; - unsigned long interval = TEST_HOST_LOOP_INTERVAL; - bool mode_selected = false; - uint64_t phys_offset = 0; - unsigned int mode; - int opt, i, j; + struct test_params p = { + .iterations = TEST_HOST_LOOP_N, + .interval = TEST_HOST_LOOP_INTERVAL, + }; + int opt, i; sem_init(&dirty_ring_vcpu_stop, 0, 0); sem_init(&dirty_ring_vcpu_cont, 0, 0); -#ifdef __x86_64__ - guest_mode_init(VM_MODE_PXXV48_4K, true, true); -#endif -#ifdef __aarch64__ - guest_mode_init(VM_MODE_P40V48_4K, true, true); - guest_mode_init(VM_MODE_P40V48_64K, true, true); - - { - unsigned int limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE); - - if (limit >= 52) - guest_mode_init(VM_MODE_P52V48_64K, true, true); - if (limit >= 48) { - guest_mode_init(VM_MODE_P48V48_4K, true, true); - guest_mode_init(VM_MODE_P48V48_64K, true, true); - } - } -#endif -#ifdef __s390x__ - guest_mode_init(VM_MODE_P40V48_4K, true, true); -#endif + guest_modes_append_default(); while ((opt = getopt(argc, argv, "c:hi:I:p:m:M:")) != -1) { switch (opt) { @@ -865,24 +830,16 @@ int main(int argc, char *argv[]) test_dirty_ring_count = strtol(optarg, NULL, 10); break; case 'i': - iterations = strtol(optarg, NULL, 10); + p.iterations = strtol(optarg, NULL, 10); break; case 'I': - interval = strtol(optarg, NULL, 10); + p.interval = strtol(optarg, NULL, 10); break; case 'p': - phys_offset = strtoull(optarg, NULL, 0); + p.phys_offset = strtoull(optarg, NULL, 0); break; case 'm': - if (!mode_selected) { - for (i = 0; i < NUM_VM_MODES; ++i) - guest_modes[i].enabled = false; - mode_selected = true; - } - mode = strtoul(optarg, NULL, 10); - TEST_ASSERT(mode < NUM_VM_MODES, - "Guest mode ID %d too big", mode); - guest_modes[mode].enabled = true; + guest_modes_cmdline(optarg); break; case 'M': if (!strcmp(optarg, "all")) { @@ -911,32 +868,24 @@ int main(int argc, char *argv[]) } } - TEST_ASSERT(iterations > 2, "Iterations must be greater than two"); - TEST_ASSERT(interval > 0, "Interval must be greater than zero"); + TEST_ASSERT(p.iterations > 2, "Iterations must be greater than two"); + TEST_ASSERT(p.interval > 0, "Interval must be greater than zero"); pr_info("Test iterations: %"PRIu64", interval: %"PRIu64" (ms)\n", - iterations, interval); + p.iterations, p.interval); srandom(time(0)); - for (i = 0; i < NUM_VM_MODES; ++i) { - if (!guest_modes[i].enabled) - continue; - TEST_ASSERT(guest_modes[i].supported, - "Guest mode ID %d (%s) not supported.", - i, vm_guest_mode_string(i)); - if (host_log_mode_option == LOG_MODE_ALL) { - /* Run each log mode */ - for (j = 0; j < LOG_MODE_NUM; j++) { - pr_info("Testing Log Mode '%s'\n", - log_modes[j].name); - host_log_mode = j; - run_test(i, iterations, interval, phys_offset); - } - } else { - host_log_mode = host_log_mode_option; - run_test(i, iterations, interval, phys_offset); + if (host_log_mode_option == LOG_MODE_ALL) { + /* Run each log mode */ + for (i = 0; i < LOG_MODE_NUM; i++) { + pr_info("Testing Log Mode '%s'\n", log_modes[i].name); + host_log_mode = i; + for_each_guest_mode(run_test, &p); } + } else { + host_log_mode = host_log_mode_option; + for_each_guest_mode(run_test, &p); } return 0; diff --git a/tools/testing/selftests/kvm/include/guest_modes.h b/tools/testing/selftests/kvm/include/guest_modes.h new file mode 100644 index 000000000000..b691df33e64e --- /dev/null +++ b/tools/testing/selftests/kvm/include/guest_modes.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020, Red Hat, Inc. + */ +#include "kvm_util.h" + +struct guest_mode { + bool supported; + bool enabled; +}; + +extern struct guest_mode guest_modes[NUM_VM_MODES]; + +#define guest_mode_append(mode, supported, enabled) ({ \ + guest_modes[mode] = (struct guest_mode){ supported, enabled }; \ +}) + +void guest_modes_append_default(void); +void for_each_guest_mode(void (*func)(enum vm_guest_mode, void *), void *arg); +void guest_modes_help(void); +void guest_modes_cmdline(const char *arg); diff --git a/tools/testing/selftests/kvm/lib/guest_modes.c b/tools/testing/selftests/kvm/lib/guest_modes.c new file mode 100644 index 000000000000..25bff307c71f --- /dev/null +++ b/tools/testing/selftests/kvm/lib/guest_modes.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020, Red Hat, Inc. + */ +#include "guest_modes.h" + +struct guest_mode guest_modes[NUM_VM_MODES]; + +void guest_modes_append_default(void) +{ + guest_mode_append(VM_MODE_DEFAULT, true, true); + +#ifdef __aarch64__ + guest_mode_append(VM_MODE_P40V48_64K, true, true); + { + unsigned int limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE); + if (limit >= 52) + guest_mode_append(VM_MODE_P52V48_64K, true, true); + if (limit >= 48) { + guest_mode_append(VM_MODE_P48V48_4K, true, true); + guest_mode_append(VM_MODE_P48V48_64K, true, true); + } + } +#endif +} + +void for_each_guest_mode(void (*func)(enum vm_guest_mode, void *), void *arg) +{ + int i; + + for (i = 0; i < NUM_VM_MODES; ++i) { + if (!guest_modes[i].enabled) + continue; + TEST_ASSERT(guest_modes[i].supported, + "Guest mode ID %d (%s) not supported.", + i, vm_guest_mode_string(i)); + func(i, arg); + } +} + +void guest_modes_help(void) +{ + int i; + + printf(" -m: specify the guest mode ID to test\n" + " (default: test all supported modes)\n" + " This option may be used multiple times.\n" + " Guest mode IDs:\n"); + for (i = 0; i < NUM_VM_MODES; ++i) { + printf(" %d: %s%s\n", i, vm_guest_mode_string(i), + guest_modes[i].supported ? " (supported)" : ""); + } +} + +void guest_modes_cmdline(const char *arg) +{ + static bool mode_selected; + unsigned int mode; + int i; + + if (!mode_selected) { + for (i = 0; i < NUM_VM_MODES; ++i) + guest_modes[i].enabled = false; + mode_selected = true; + } + + mode = strtoul(optarg, NULL, 10); + TEST_ASSERT(mode < NUM_VM_MODES, "Guest mode ID %d too big", mode); + guest_modes[mode].enabled = true; +} From 1133e17ea7c9929ff7b90e81d8926f9e870748e9 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Fri, 18 Dec 2020 15:17:33 +0100 Subject: [PATCH 493/547] KVM: selftests: Use vm_create_with_vcpus in create_vm Reviewed-by: Ben Gardon Signed-off-by: Andrew Jones Message-Id: <20201218141734.54359-3-drjones@redhat.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/demand_paging_test.c | 2 +- .../selftests/kvm/dirty_log_perf_test.c | 2 - .../testing/selftests/kvm/include/kvm_util.h | 8 ++++ .../selftests/kvm/include/perf_test_util.h | 47 +++++-------------- tools/testing/selftests/kvm/lib/kvm_util.c | 9 +--- 5 files changed, 21 insertions(+), 47 deletions(-) diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index 946161a9ce2d..b0c41de32e9b 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -7,7 +7,7 @@ * Copyright (C) 2019, Google, Inc. */ -#define _GNU_SOURCE /* for program_invocation_name and pipe2 */ +#define _GNU_SOURCE /* for pipe2 */ #include #include diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index 506741eb5d7f..36bea75a8d6f 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -8,8 +8,6 @@ * Copyright (C) 2020, Google, Inc. */ -#define _GNU_SOURCE /* for program_invocation_name */ - #include #include #include diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index dfa9d369e8fc..149766ecd68b 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -70,6 +70,14 @@ enum vm_guest_mode { #define vm_guest_mode_string(m) vm_guest_mode_string[m] extern const char * const vm_guest_mode_string[]; +struct vm_guest_mode_params { + unsigned int pa_bits; + unsigned int va_bits; + unsigned int page_size; + unsigned int page_shift; +}; +extern const struct vm_guest_mode_params vm_guest_mode_params[]; + enum vm_mem_backing_src_type { VM_MEM_SRC_ANONYMOUS, VM_MEM_SRC_ANONYMOUS_THP, diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h index 239421e4f6b8..cd4c258f458d 100644 --- a/tools/testing/selftests/kvm/include/perf_test_util.h +++ b/tools/testing/selftests/kvm/include/perf_test_util.h @@ -13,9 +13,6 @@ #define MAX_VCPUS 512 -#define PAGE_SHIFT_4K 12 -#define PTES_PER_4K_PT 512 - #define TEST_MEM_SLOT_INDEX 1 /* Default guest test virtual memory offset */ @@ -94,41 +91,26 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, int vcpus, uint64_t vcpu_memory_bytes) { struct kvm_vm *vm; - uint64_t pages = DEFAULT_GUEST_PHY_PAGES; uint64_t guest_num_pages; - /* Account for a few pages per-vCPU for stacks */ - pages += DEFAULT_STACK_PGS * vcpus; - - /* - * Reserve twice the ammount of memory needed to map the test region and - * the page table / stacks region, at 4k, for page tables. Do the - * calculation with 4K page size: the smallest of all archs. (e.g., 64K - * page size guest will need even less memory for page tables). - */ - pages += (2 * pages) / PTES_PER_4K_PT; - pages += ((2 * vcpus * vcpu_memory_bytes) >> PAGE_SHIFT_4K) / - PTES_PER_4K_PT; - pages = vm_adjust_num_guest_pages(mode, pages); - pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); - vm = vm_create(mode, pages, O_RDWR); - kvm_vm_elf_load(vm, program_invocation_name, 0, 0); -#ifdef __x86_64__ - vm_create_irqchip(vm); -#endif - - perf_test_args.vm = vm; - perf_test_args.guest_page_size = vm_get_page_size(vm); perf_test_args.host_page_size = getpagesize(); + perf_test_args.guest_page_size = vm_guest_mode_params[mode].page_size; + guest_num_pages = vm_adjust_num_guest_pages(mode, + (vcpus * vcpu_memory_bytes) / perf_test_args.guest_page_size); + + TEST_ASSERT(vcpu_memory_bytes % perf_test_args.host_page_size == 0, + "Guest memory size is not host page size aligned."); TEST_ASSERT(vcpu_memory_bytes % perf_test_args.guest_page_size == 0, "Guest memory size is not guest page size aligned."); - guest_num_pages = (vcpus * vcpu_memory_bytes) / - perf_test_args.guest_page_size; - guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages); + vm = vm_create_with_vcpus(mode, vcpus, + (vcpus * vcpu_memory_bytes) / perf_test_args.guest_page_size, + 0, guest_code, NULL); + + perf_test_args.vm = vm; /* * If there should be more memory in the guest test region than there @@ -140,18 +122,13 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, int vcpus, guest_num_pages, vm_get_max_gfn(vm), vcpus, vcpu_memory_bytes); - TEST_ASSERT(vcpu_memory_bytes % perf_test_args.host_page_size == 0, - "Guest memory size is not host page size aligned."); - guest_test_phys_mem = (vm_get_max_gfn(vm) - guest_num_pages) * perf_test_args.guest_page_size; guest_test_phys_mem &= ~(perf_test_args.host_page_size - 1); - #ifdef __s390x__ /* Align to 1M (segment size) */ guest_test_phys_mem &= ~((1 << 20) - 1); #endif - pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem); /* Add an extra memory slot for testing */ @@ -177,8 +154,6 @@ static void add_vcpus(struct kvm_vm *vm, int vcpus, uint64_t vcpu_memory_bytes) for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) { vcpu_args = &perf_test_args.vcpu_args[vcpu_id]; - vm_vcpu_add_default(vm, vcpu_id, guest_code); - vcpu_args->vcpu_id = vcpu_id; vcpu_args->gva = guest_test_virt_mem + (vcpu_id * vcpu_memory_bytes); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 88ef7067f1e6..fa5a90e6c6f0 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -153,14 +153,7 @@ const char * const vm_guest_mode_string[] = { _Static_assert(sizeof(vm_guest_mode_string)/sizeof(char *) == NUM_VM_MODES, "Missing new mode strings?"); -struct vm_guest_mode_params { - unsigned int pa_bits; - unsigned int va_bits; - unsigned int page_size; - unsigned int page_shift; -}; - -static const struct vm_guest_mode_params vm_guest_mode_params[] = { +const struct vm_guest_mode_params vm_guest_mode_params[] = { { 52, 48, 0x1000, 12 }, { 52, 48, 0x10000, 16 }, { 48, 48, 0x1000, 12 }, From b268b6f0bd36322358accb15c45683a9e1220231 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Fri, 18 Dec 2020 15:17:34 +0100 Subject: [PATCH 494/547] KVM: selftests: Implement perf_test_util more conventionally It's not conventional C to put non-inline functions in header files. Create a source file for the functions instead. Also reduce the amount of globals and rename the functions to something less generic. Reviewed-by: Ben Gardon Reviewed-by: Peter Xu Signed-off-by: Andrew Jones Message-Id: <20201218141734.54359-4-drjones@redhat.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/Makefile | 2 +- .../selftests/kvm/demand_paging_test.c | 11 +- .../selftests/kvm/dirty_log_perf_test.c | 22 +-- .../testing/selftests/kvm/include/kvm_util.h | 1 + .../selftests/kvm/include/perf_test_util.h | 142 ++---------------- .../selftests/kvm/lib/perf_test_util.c | 134 +++++++++++++++++ 6 files changed, 166 insertions(+), 146 deletions(-) create mode 100644 tools/testing/selftests/kvm/lib/perf_test_util.c diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index a7286a08c3ae..fe41c6a0fa67 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -33,7 +33,7 @@ ifeq ($(ARCH),s390) UNAME_M := s390x endif -LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c lib/test_util.c lib/guest_modes.c +LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c lib/test_util.c lib/guest_modes.c lib/perf_test_util.c LIBKVM_x86_64 = lib/x86_64/processor.c lib/x86_64/vmx.c lib/x86_64/svm.c lib/x86_64/ucall.c lib/x86_64/handlers.S LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c lib/s390x/diag318_test_handler.c diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index b0c41de32e9b..cdad1eca72f7 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -36,12 +36,14 @@ #define PER_VCPU_DEBUG(...) _no_printf(__VA_ARGS__) #endif +static int nr_vcpus = 1; +static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE; static char *guest_data_prototype; static void *vcpu_worker(void *data) { int ret; - struct vcpu_args *vcpu_args = (struct vcpu_args *)data; + struct perf_test_vcpu_args *vcpu_args = (struct perf_test_vcpu_args *)data; int vcpu_id = vcpu_args->vcpu_id; struct kvm_vm *vm = perf_test_args.vm; struct kvm_run *run; @@ -263,7 +265,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) int vcpu_id; int r; - vm = create_vm(mode, nr_vcpus, guest_percpu_mem_size); + vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size); perf_test_args.wr_fract = 1; @@ -275,7 +277,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads)); TEST_ASSERT(vcpu_threads, "Memory allocation failed"); - add_vcpus(vm, nr_vcpus, guest_percpu_mem_size); + perf_test_setup_vcpus(vm, nr_vcpus, guest_percpu_mem_size); if (p->use_uffd) { uffd_handler_threads = @@ -359,8 +361,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) perf_test_args.vcpu_args[0].pages * nr_vcpus / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 100000000.0)); - ucall_uninit(vm); - kvm_vm_free(vm); + perf_test_destroy_vm(vm); free(guest_data_prototype); free(vcpu_threads); diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index 36bea75a8d6f..2283a0ec74a9 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -22,11 +22,14 @@ /* How many host loops to run by default (one KVM_GET_DIRTY_LOG for each loop)*/ #define TEST_HOST_LOOP_N 2UL +static int nr_vcpus = 1; +static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE; + /* Host variables */ static u64 dirty_log_manual_caps; static bool host_quit; static uint64_t iteration; -static uint64_t vcpu_last_completed_iteration[MAX_VCPUS]; +static uint64_t vcpu_last_completed_iteration[KVM_MAX_VCPUS]; static void *vcpu_worker(void *data) { @@ -38,7 +41,7 @@ static void *vcpu_worker(void *data) struct timespec ts_diff; struct timespec total = (struct timespec){0}; struct timespec avg; - struct vcpu_args *vcpu_args = (struct vcpu_args *)data; + struct perf_test_vcpu_args *vcpu_args = (struct perf_test_vcpu_args *)data; int vcpu_id = vcpu_args->vcpu_id; vcpu_args_set(vm, vcpu_id, 1, vcpu_id); @@ -108,7 +111,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) struct kvm_enable_cap cap = {}; struct timespec clear_dirty_log_total = (struct timespec){0}; - vm = create_vm(mode, nr_vcpus, guest_percpu_mem_size); + vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size); perf_test_args.wr_fract = p->wr_fract; @@ -126,7 +129,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads)); TEST_ASSERT(vcpu_threads, "Memory allocation failed"); - add_vcpus(vm, nr_vcpus, guest_percpu_mem_size); + perf_test_setup_vcpus(vm, nr_vcpus, guest_percpu_mem_size); sync_global_to_guest(vm, perf_test_args); @@ -152,7 +155,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) /* Enable dirty logging */ clock_gettime(CLOCK_MONOTONIC, &start); - vm_mem_region_set_flags(vm, TEST_MEM_SLOT_INDEX, + vm_mem_region_set_flags(vm, PERF_TEST_MEM_SLOT_INDEX, KVM_MEM_LOG_DIRTY_PAGES); ts_diff = timespec_diff_now(start); pr_info("Enabling dirty logging time: %ld.%.9lds\n\n", @@ -179,7 +182,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) iteration, ts_diff.tv_sec, ts_diff.tv_nsec); clock_gettime(CLOCK_MONOTONIC, &start); - kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap); + kvm_vm_get_dirty_log(vm, PERF_TEST_MEM_SLOT_INDEX, bmap); ts_diff = timespec_diff_now(start); get_dirty_log_total = timespec_add(get_dirty_log_total, @@ -189,7 +192,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) if (dirty_log_manual_caps) { clock_gettime(CLOCK_MONOTONIC, &start); - kvm_vm_clear_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap, 0, + kvm_vm_clear_dirty_log(vm, PERF_TEST_MEM_SLOT_INDEX, bmap, 0, host_num_pages); ts_diff = timespec_diff_now(start); @@ -207,7 +210,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) /* Disable dirty logging */ clock_gettime(CLOCK_MONOTONIC, &start); - vm_mem_region_set_flags(vm, TEST_MEM_SLOT_INDEX, 0); + vm_mem_region_set_flags(vm, PERF_TEST_MEM_SLOT_INDEX, 0); ts_diff = timespec_diff_now(start); pr_info("Disabling dirty logging time: %ld.%.9lds\n", ts_diff.tv_sec, ts_diff.tv_nsec); @@ -226,8 +229,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) free(bmap); free(vcpu_threads); - ucall_uninit(vm); - kvm_vm_free(vm); + perf_test_destroy_vm(vm); } static void help(char *name) diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 149766ecd68b..5cbb861525ed 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -16,6 +16,7 @@ #include "sparsebit.h" +#define KVM_MAX_VCPUS 512 /* * Callers of kvm_util only have an incomplete/opaque description of the diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h index cd4c258f458d..b1188823c31b 100644 --- a/tools/testing/selftests/kvm/include/perf_test_util.h +++ b/tools/testing/selftests/kvm/include/perf_test_util.h @@ -9,35 +9,15 @@ #define SELFTEST_KVM_PERF_TEST_UTIL_H #include "kvm_util.h" -#include "processor.h" - -#define MAX_VCPUS 512 - -#define TEST_MEM_SLOT_INDEX 1 /* Default guest test virtual memory offset */ #define DEFAULT_GUEST_TEST_MEM 0xc0000000 #define DEFAULT_PER_VCPU_MEM_SIZE (1 << 30) /* 1G */ -/* - * Guest physical memory offset of the testing memory slot. - * This will be set to the topmost valid physical address minus - * the test memory size. - */ -static uint64_t guest_test_phys_mem; +#define PERF_TEST_MEM_SLOT_INDEX 1 -/* - * Guest virtual memory offset of the testing memory slot. - * Must not conflict with identity mapped test code. - */ -static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM; -static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE; - -/* Number of VCPUs for the test */ -static int nr_vcpus = 1; - -struct vcpu_args { +struct perf_test_vcpu_args { uint64_t gva; uint64_t pages; @@ -51,119 +31,21 @@ struct perf_test_args { uint64_t guest_page_size; int wr_fract; - struct vcpu_args vcpu_args[MAX_VCPUS]; + struct perf_test_vcpu_args vcpu_args[KVM_MAX_VCPUS]; }; -static struct perf_test_args perf_test_args; +extern struct perf_test_args perf_test_args; /* - * Continuously write to the first 8 bytes of each page in the - * specified region. + * Guest physical memory offset of the testing memory slot. + * This will be set to the topmost valid physical address minus + * the test memory size. */ -static void guest_code(uint32_t vcpu_id) -{ - struct vcpu_args *vcpu_args = &perf_test_args.vcpu_args[vcpu_id]; - uint64_t gva; - uint64_t pages; - int i; +extern uint64_t guest_test_phys_mem; - /* Make sure vCPU args data structure is not corrupt. */ - GUEST_ASSERT(vcpu_args->vcpu_id == vcpu_id); - - gva = vcpu_args->gva; - pages = vcpu_args->pages; - - while (true) { - for (i = 0; i < pages; i++) { - uint64_t addr = gva + (i * perf_test_args.guest_page_size); - - if (i % perf_test_args.wr_fract == 0) - *(uint64_t *)addr = 0x0123456789ABCDEF; - else - READ_ONCE(*(uint64_t *)addr); - } - - GUEST_SYNC(1); - } -} - -static struct kvm_vm *create_vm(enum vm_guest_mode mode, int vcpus, - uint64_t vcpu_memory_bytes) -{ - struct kvm_vm *vm; - uint64_t guest_num_pages; - - pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); - - perf_test_args.host_page_size = getpagesize(); - perf_test_args.guest_page_size = vm_guest_mode_params[mode].page_size; - - guest_num_pages = vm_adjust_num_guest_pages(mode, - (vcpus * vcpu_memory_bytes) / perf_test_args.guest_page_size); - - TEST_ASSERT(vcpu_memory_bytes % perf_test_args.host_page_size == 0, - "Guest memory size is not host page size aligned."); - TEST_ASSERT(vcpu_memory_bytes % perf_test_args.guest_page_size == 0, - "Guest memory size is not guest page size aligned."); - - vm = vm_create_with_vcpus(mode, vcpus, - (vcpus * vcpu_memory_bytes) / perf_test_args.guest_page_size, - 0, guest_code, NULL); - - perf_test_args.vm = vm; - - /* - * If there should be more memory in the guest test region than there - * can be pages in the guest, it will definitely cause problems. - */ - TEST_ASSERT(guest_num_pages < vm_get_max_gfn(vm), - "Requested more guest memory than address space allows.\n" - " guest pages: %lx max gfn: %x vcpus: %d wss: %lx]\n", - guest_num_pages, vm_get_max_gfn(vm), vcpus, - vcpu_memory_bytes); - - guest_test_phys_mem = (vm_get_max_gfn(vm) - guest_num_pages) * - perf_test_args.guest_page_size; - guest_test_phys_mem &= ~(perf_test_args.host_page_size - 1); -#ifdef __s390x__ - /* Align to 1M (segment size) */ - guest_test_phys_mem &= ~((1 << 20) - 1); -#endif - pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem); - - /* Add an extra memory slot for testing */ - vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, - guest_test_phys_mem, - TEST_MEM_SLOT_INDEX, - guest_num_pages, 0); - - /* Do mapping for the demand paging memory slot */ - virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages, 0); - - ucall_init(vm, NULL); - - return vm; -} - -static void add_vcpus(struct kvm_vm *vm, int vcpus, uint64_t vcpu_memory_bytes) -{ - vm_paddr_t vcpu_gpa; - struct vcpu_args *vcpu_args; - int vcpu_id; - - for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) { - vcpu_args = &perf_test_args.vcpu_args[vcpu_id]; - - vcpu_args->vcpu_id = vcpu_id; - vcpu_args->gva = guest_test_virt_mem + - (vcpu_id * vcpu_memory_bytes); - vcpu_args->pages = vcpu_memory_bytes / - perf_test_args.guest_page_size; - - vcpu_gpa = guest_test_phys_mem + (vcpu_id * vcpu_memory_bytes); - pr_debug("Added VCPU %d with test mem gpa [%lx, %lx)\n", - vcpu_id, vcpu_gpa, vcpu_gpa + vcpu_memory_bytes); - } -} +struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, + uint64_t vcpu_memory_bytes); +void perf_test_destroy_vm(struct kvm_vm *vm); +void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus, uint64_t vcpu_memory_bytes); #endif /* SELFTEST_KVM_PERF_TEST_UTIL_H */ diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c new file mode 100644 index 000000000000..9be1944c2d1c --- /dev/null +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020, Google LLC. + */ + +#include "kvm_util.h" +#include "perf_test_util.h" +#include "processor.h" + +struct perf_test_args perf_test_args; + +uint64_t guest_test_phys_mem; + +/* + * Guest virtual memory offset of the testing memory slot. + * Must not conflict with identity mapped test code. + */ +static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM; + +/* + * Continuously write to the first 8 bytes of each page in the + * specified region. + */ +static void guest_code(uint32_t vcpu_id) +{ + struct perf_test_vcpu_args *vcpu_args = &perf_test_args.vcpu_args[vcpu_id]; + uint64_t gva; + uint64_t pages; + int i; + + /* Make sure vCPU args data structure is not corrupt. */ + GUEST_ASSERT(vcpu_args->vcpu_id == vcpu_id); + + gva = vcpu_args->gva; + pages = vcpu_args->pages; + + while (true) { + for (i = 0; i < pages; i++) { + uint64_t addr = gva + (i * perf_test_args.guest_page_size); + + if (i % perf_test_args.wr_fract == 0) + *(uint64_t *)addr = 0x0123456789ABCDEF; + else + READ_ONCE(*(uint64_t *)addr); + } + + GUEST_SYNC(1); + } +} + +struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, + uint64_t vcpu_memory_bytes) +{ + struct kvm_vm *vm; + uint64_t guest_num_pages; + + pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); + + perf_test_args.host_page_size = getpagesize(); + perf_test_args.guest_page_size = vm_guest_mode_params[mode].page_size; + + guest_num_pages = vm_adjust_num_guest_pages(mode, + (vcpus * vcpu_memory_bytes) / perf_test_args.guest_page_size); + + TEST_ASSERT(vcpu_memory_bytes % perf_test_args.host_page_size == 0, + "Guest memory size is not host page size aligned."); + TEST_ASSERT(vcpu_memory_bytes % perf_test_args.guest_page_size == 0, + "Guest memory size is not guest page size aligned."); + + vm = vm_create_with_vcpus(mode, vcpus, + (vcpus * vcpu_memory_bytes) / perf_test_args.guest_page_size, + 0, guest_code, NULL); + + perf_test_args.vm = vm; + + /* + * If there should be more memory in the guest test region than there + * can be pages in the guest, it will definitely cause problems. + */ + TEST_ASSERT(guest_num_pages < vm_get_max_gfn(vm), + "Requested more guest memory than address space allows.\n" + " guest pages: %lx max gfn: %x vcpus: %d wss: %lx]\n", + guest_num_pages, vm_get_max_gfn(vm), vcpus, + vcpu_memory_bytes); + + guest_test_phys_mem = (vm_get_max_gfn(vm) - guest_num_pages) * + perf_test_args.guest_page_size; + guest_test_phys_mem &= ~(perf_test_args.host_page_size - 1); +#ifdef __s390x__ + /* Align to 1M (segment size) */ + guest_test_phys_mem &= ~((1 << 20) - 1); +#endif + pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem); + + /* Add an extra memory slot for testing */ + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + guest_test_phys_mem, + PERF_TEST_MEM_SLOT_INDEX, + guest_num_pages, 0); + + /* Do mapping for the demand paging memory slot */ + virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages, 0); + + ucall_init(vm, NULL); + + return vm; +} + +void perf_test_destroy_vm(struct kvm_vm *vm) +{ + ucall_uninit(vm); + kvm_vm_free(vm); +} + +void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus, uint64_t vcpu_memory_bytes) +{ + vm_paddr_t vcpu_gpa; + struct perf_test_vcpu_args *vcpu_args; + int vcpu_id; + + for (vcpu_id = 0; vcpu_id < vcpus; vcpu_id++) { + vcpu_args = &perf_test_args.vcpu_args[vcpu_id]; + + vcpu_args->vcpu_id = vcpu_id; + vcpu_args->gva = guest_test_virt_mem + + (vcpu_id * vcpu_memory_bytes); + vcpu_args->pages = vcpu_memory_bytes / + perf_test_args.guest_page_size; + + vcpu_gpa = guest_test_phys_mem + (vcpu_id * vcpu_memory_bytes); + pr_debug("Added VCPU %d with test mem gpa [%lx, %lx)\n", + vcpu_id, vcpu_gpa, vcpu_gpa + vcpu_memory_bytes); + } +} From 2f80d502d627f30257ba7e3655e71c373b7d1a5a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 22 Dec 2020 05:20:43 -0500 Subject: [PATCH 495/547] KVM: x86: fix shift out of bounds reported by UBSAN Since we know that e >= s, we can reassociate the left shift, changing the shifted number from 1 to 2 in exchange for decreasing the right hand side by 1. Reported-by: syzbot+e87846c48bf72bc85311@syzkaller.appspotmail.com Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 9c4a9c8e43d9..581925e476d6 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -49,7 +49,7 @@ static inline u64 rsvd_bits(int s, int e) if (e < s) return 0; - return ((1ULL << (e - s + 1)) - 1) << s; + return ((2ULL << (e - s)) - 1) << s; } void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 access_mask); From 7f0c1f1a8277de906a242a6ef907476149f006de Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 6 Jan 2021 10:29:16 -0800 Subject: [PATCH 496/547] MAINTAINERS: Really update email address for Sean Christopherson Use my @google.com address in MAINTAINERS, somehow only the .mailmap entry was added when the original update patch was applied. Fixes: c2b1209d852f ("MAINTAINERS: Update email address for Sean Christopherson") Cc: kvm@vger.kernel.org Reported-by: Nathan Chancellor Signed-off-by: Sean Christopherson Message-Id: <20210106182916.331743-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 281de213ef47..9d8b77332ae3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9672,7 +9672,7 @@ F: tools/testing/selftests/kvm/s390x/ KERNEL VIRTUAL MACHINE FOR X86 (KVM/x86) M: Paolo Bonzini -R: Sean Christopherson +R: Sean Christopherson R: Vitaly Kuznetsov R: Wanpeng Li R: Jim Mattson From de7860c8a388e4cb757c7da26889b9e2641ffcfe Mon Sep 17 00:00:00 2001 From: Stephen Zhang Date: Fri, 18 Dec 2020 15:51:37 +0800 Subject: [PATCH 497/547] KVM: x86: change in pv_eoi_get_pending() to make code more readable Signed-off-by: Stephen Zhang Message-Id: <1608277897-1932-1-git-send-email-stephenzhangzsd@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3136e05831cf..78823227c592 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -674,7 +674,7 @@ static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu) (unsigned long long)vcpu->arch.pv_eoi.msr_val); return false; } - return val & 0x1; + return val & KVM_PV_EOI_ENABLED; } static void pv_eoi_set_pending(struct kvm_vcpu *vcpu) From 88bf56d04bc3564542049ec4ec168a8b60d0b48c Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 17 Dec 2020 23:41:18 +0800 Subject: [PATCH 498/547] kvm: check tlbs_dirty directly In kvm_mmu_notifier_invalidate_range_start(), tlbs_dirty is used as: need_tlb_flush |= kvm->tlbs_dirty; with need_tlb_flush's type being int and tlbs_dirty's type being long. It means that tlbs_dirty is always used as int and the higher 32 bits is useless. We need to check tlbs_dirty in a correct way and this change checks it directly without propagating it to need_tlb_flush. Note: it's _extremely_ unlikely this neglecting of higher 32 bits can cause problems in practice. It would require encountering tlbs_dirty on a 4 billion count boundary, and KVM would need to be using shadow paging or be running a nested guest. Cc: stable@vger.kernel.org Fixes: a4ee1ca4a36e ("KVM: MMU: delay flush all tlbs on sync_page path") Signed-off-by: Lai Jiangshan Message-Id: <20201217154118.16497-1-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 3abcb2ce5b7d..19dae28904f7 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -485,9 +485,8 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, kvm->mmu_notifier_count++; need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end, range->flags); - need_tlb_flush |= kvm->tlbs_dirty; /* we've to flush the tlb before the pages can be freed */ - if (need_tlb_flush) + if (need_tlb_flush || kvm->tlbs_dirty) kvm_flush_remote_tlbs(kvm); spin_unlock(&kvm->mmu_lock); From a889ea54b3daa63ee1463dc19ed699407d61458b Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Wed, 6 Jan 2021 16:19:34 -0800 Subject: [PATCH 499/547] KVM: x86/mmu: Ensure TDP MMU roots are freed after yield Many TDP MMU functions which need to perform some action on all TDP MMU roots hold a reference on that root so that they can safely drop the MMU lock in order to yield to other threads. However, when releasing the reference on the root, there is a bug: the root will not be freed even if its reference count (root_count) is reduced to 0. To simplify acquiring and releasing references on TDP MMU root pages, and to ensure that these roots are properly freed, move the get/put operations into another TDP MMU root iterator macro. Moving the get/put operations into an iterator macro also helps simplify control flow when a root does need to be freed. Note that using the list_for_each_entry_safe macro would not have been appropriate in this situation because it could keep a pointer to the next root across an MMU lock release + reacquire, during which time that root could be freed. Reported-by: Maciej S. Szmigiero Suggested-by: Paolo Bonzini Fixes: faaf05b00aec ("kvm: x86/mmu: Support zapping SPTEs in the TDP MMU") Fixes: 063afacd8730 ("kvm: x86/mmu: Support invalidate range MMU notifier for TDP MMU") Fixes: a6a0b05da9f3 ("kvm: x86/mmu: Support dirty logging for the TDP MMU") Fixes: 14881998566d ("kvm: x86/mmu: Support disabling dirty logging for the tdp MMU") Signed-off-by: Ben Gardon Message-Id: <20210107001935.3732070-1-bgardon@google.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_mmu.c | 104 +++++++++++++++++-------------------- 1 file changed, 48 insertions(+), 56 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 6574af2d0994..2ef8615f9dba 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -44,7 +44,48 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); } -#define for_each_tdp_mmu_root(_kvm, _root) \ +static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root) +{ + if (kvm_mmu_put_root(kvm, root)) + kvm_tdp_mmu_free_root(kvm, root); +} + +static inline bool tdp_mmu_next_root_valid(struct kvm *kvm, + struct kvm_mmu_page *root) +{ + lockdep_assert_held(&kvm->mmu_lock); + + if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link)) + return false; + + kvm_mmu_get_root(kvm, root); + return true; + +} + +static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, + struct kvm_mmu_page *root) +{ + struct kvm_mmu_page *next_root; + + next_root = list_next_entry(root, link); + tdp_mmu_put_root(kvm, root); + return next_root; +} + +/* + * Note: this iterator gets and puts references to the roots it iterates over. + * This makes it safe to release the MMU lock and yield within the loop, but + * if exiting the loop early, the caller must drop the reference to the most + * recent root. (Unless keeping a live reference is desirable.) + */ +#define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \ + for (_root = list_first_entry(&_kvm->arch.tdp_mmu_roots, \ + typeof(*_root), link); \ + tdp_mmu_next_root_valid(_kvm, _root); \ + _root = tdp_mmu_next_root(_kvm, _root)) + +#define for_each_tdp_mmu_root(_kvm, _root) \ list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa) @@ -447,18 +488,9 @@ bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end) struct kvm_mmu_page *root; bool flush = false; - for_each_tdp_mmu_root(kvm, root) { - /* - * Take a reference on the root so that it cannot be freed if - * this thread releases the MMU lock and yields in this loop. - */ - kvm_mmu_get_root(kvm, root); - + for_each_tdp_mmu_root_yield_safe(kvm, root) flush |= zap_gfn_range(kvm, root, start, end, true); - kvm_mmu_put_root(kvm, root); - } - return flush; } @@ -619,13 +651,7 @@ static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start, int ret = 0; int as_id; - for_each_tdp_mmu_root(kvm, root) { - /* - * Take a reference on the root so that it cannot be freed if - * this thread releases the MMU lock and yields in this loop. - */ - kvm_mmu_get_root(kvm, root); - + for_each_tdp_mmu_root_yield_safe(kvm, root) { as_id = kvm_mmu_page_as_id(root); slots = __kvm_memslots(kvm, as_id); kvm_for_each_memslot(memslot, slots) { @@ -647,8 +673,6 @@ static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start, ret |= handler(kvm, memslot, root, gfn_start, gfn_end, data); } - - kvm_mmu_put_root(kvm, root); } return ret; @@ -838,21 +862,13 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, int root_as_id; bool spte_set = false; - for_each_tdp_mmu_root(kvm, root) { + for_each_tdp_mmu_root_yield_safe(kvm, root) { root_as_id = kvm_mmu_page_as_id(root); if (root_as_id != slot->as_id) continue; - /* - * Take a reference on the root so that it cannot be freed if - * this thread releases the MMU lock and yields in this loop. - */ - kvm_mmu_get_root(kvm, root); - spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn, slot->base_gfn + slot->npages, min_level); - - kvm_mmu_put_root(kvm, root); } return spte_set; @@ -906,21 +922,13 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot) int root_as_id; bool spte_set = false; - for_each_tdp_mmu_root(kvm, root) { + for_each_tdp_mmu_root_yield_safe(kvm, root) { root_as_id = kvm_mmu_page_as_id(root); if (root_as_id != slot->as_id) continue; - /* - * Take a reference on the root so that it cannot be freed if - * this thread releases the MMU lock and yields in this loop. - */ - kvm_mmu_get_root(kvm, root); - spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn, slot->base_gfn + slot->npages); - - kvm_mmu_put_root(kvm, root); } return spte_set; @@ -1029,21 +1037,13 @@ bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot) int root_as_id; bool spte_set = false; - for_each_tdp_mmu_root(kvm, root) { + for_each_tdp_mmu_root_yield_safe(kvm, root) { root_as_id = kvm_mmu_page_as_id(root); if (root_as_id != slot->as_id) continue; - /* - * Take a reference on the root so that it cannot be freed if - * this thread releases the MMU lock and yields in this loop. - */ - kvm_mmu_get_root(kvm, root); - spte_set |= set_dirty_gfn_range(kvm, root, slot->base_gfn, slot->base_gfn + slot->npages); - - kvm_mmu_put_root(kvm, root); } return spte_set; } @@ -1089,21 +1089,13 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, struct kvm_mmu_page *root; int root_as_id; - for_each_tdp_mmu_root(kvm, root) { + for_each_tdp_mmu_root_yield_safe(kvm, root) { root_as_id = kvm_mmu_page_as_id(root); if (root_as_id != slot->as_id) continue; - /* - * Take a reference on the root so that it cannot be freed if - * this thread releases the MMU lock and yields in this loop. - */ - kvm_mmu_get_root(kvm, root); - zap_collapsible_spte_range(kvm, root, slot->base_gfn, slot->base_gfn + slot->npages); - - kvm_mmu_put_root(kvm, root); } } From c0dba6e46825716db15c4b3a8f05c85b4a59edda Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Wed, 6 Jan 2021 16:19:35 -0800 Subject: [PATCH 500/547] KVM: x86/mmu: Clarify TDP MMU page list invariants The tdp_mmu_roots and tdp_mmu_pages in struct kvm_arch should only contain pages with tdp_mmu_page set to true. tdp_mmu_pages should not contain any pages with a non-zero root_count and tdp_mmu_roots should only contain pages with a positive root_count, unless a thread holds the MMU lock and is in the process of modifying the list. Various functions expect these invariants to be maintained, but they are not explictily documented. Add to the comments on both fields to document the above invariants. Signed-off-by: Ben Gardon Message-Id: <20210107001935.3732070-2-bgardon@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3ab7b46087b7..afed3da3b3a0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1010,9 +1010,21 @@ struct kvm_arch { */ bool tdp_mmu_enabled; - /* List of struct tdp_mmu_pages being used as roots */ + /* + * List of struct kvmp_mmu_pages being used as roots. + * All struct kvm_mmu_pages in the list should have + * tdp_mmu_page set. + * All struct kvm_mmu_pages in the list should have a positive + * root_count except when a thread holds the MMU lock and is removing + * an entry from the list. + */ struct list_head tdp_mmu_roots; - /* List of struct tdp_mmu_pages not being used as roots */ + + /* + * List of struct kvmp_mmu_pages not being used as roots. + * All struct kvm_mmu_pages in the list should have + * tdp_mmu_page set and a root_count of 0. + */ struct list_head tdp_mmu_pages; }; From 81f76adad560dfc39cb9625cf1e00a7e2b7b88df Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Thu, 7 Jan 2021 11:38:52 +0200 Subject: [PATCH 501/547] KVM: nSVM: correctly restore nested_run_pending on migration The code to store it on the migration exists, but no code was restoring it. One of the side effects of fixing this is that L1->L2 injected events are no longer lost when migration happens with nested run pending. Signed-off-by: Maxim Levitsky Message-Id: <20210107093854.882483-3-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index b0b667456b2e..a466336aab43 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1194,6 +1194,10 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, * in the registers, the save area of the nested state instead * contains saved L1 state. */ + + svm->nested.nested_run_pending = + !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); + copy_vmcb_control_area(&hsave->control, &svm->vmcb->control); hsave->save = *save; From 56fe28de8c4f0167275c411c0daa5709e9a47bd7 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Thu, 7 Jan 2021 11:38:54 +0200 Subject: [PATCH 502/547] KVM: nSVM: mark vmcb as dirty when forcingly leaving the guest mode We overwrite most of vmcb fields while doing so, so we must mark it as dirty. Signed-off-by: Maxim Levitsky Message-Id: <20210107093854.882483-5-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index a466336aab43..a622e63739b4 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -754,6 +754,7 @@ void svm_leave_nested(struct vcpu_svm *svm) leave_guest_mode(&svm->vcpu); copy_vmcb_control_area(&vmcb->control, &hsave->control); nested_svm_uninit_mmu_context(&svm->vcpu); + vmcb_mark_all_dirty(svm->vmcb); } kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu); From f2c7ef3ba9556d62a7e2bb23b563c6510007d55c Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Thu, 7 Jan 2021 11:38:51 +0200 Subject: [PATCH 503/547] KVM: nSVM: cancel KVM_REQ_GET_NESTED_STATE_PAGES on nested vmexit It is possible to exit the nested guest mode, entered by svm_set_nested_state prior to first vm entry to it (e.g due to pending event) if the nested run was not pending during the migration. In this case we must not switch to the nested msr permission bitmap. Also add a warning to catch similar cases in the future. Fixes: a7d5c7ce41ac1 ("KVM: nSVM: delay MSR permission processing to first nested VM run") Signed-off-by: Maxim Levitsky Message-Id: <20210107093854.882483-2-mlevitsk@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 3 +++ arch/x86/kvm/vmx/nested.c | 2 ++ arch/x86/kvm/x86.c | 4 +++- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index a622e63739b4..cb4c6ee10029 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -199,6 +199,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + if (!nested_svm_vmrun_msrpm(svm)) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = @@ -595,6 +596,8 @@ int nested_svm_vmexit(struct vcpu_svm *svm) svm->nested.vmcb12_gpa = 0; WARN_ON_ONCE(svm->nested.nested_run_pending); + kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, &svm->vcpu); + /* in case we halted in L2 */ svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e2f26564a12d..0fbb46990dfc 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4442,6 +4442,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, /* trying to cancel vmlaunch/vmresume is a bug */ WARN_ON_ONCE(vmx->nested.nested_run_pending); + kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); + /* Service the TLB flush request for L2 before switching to L1. */ if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) kvm_vcpu_flush_tlb_current(vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e28ab76b80dc..f6e7b25c40e2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8789,7 +8789,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (kvm_request_pending(vcpu)) { if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { - if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) { + if (WARN_ON_ONCE(!is_guest_mode(vcpu))) + ; + else if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) { r = 0; goto out; } From 647daca25d24fb6eadc7b6cd680ad3e6eed0f3d5 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 4 Jan 2021 14:20:01 -0600 Subject: [PATCH 504/547] KVM: SVM: Add support for booting APs in an SEV-ES guest Typically under KVM, an AP is booted using the INIT-SIPI-SIPI sequence, where the guest vCPU register state is updated and then the vCPU is VMRUN to begin execution of the AP. For an SEV-ES guest, this won't work because the guest register state is encrypted. Following the GHCB specification, the hypervisor must not alter the guest register state, so KVM must track an AP/vCPU boot. Should the guest want to park the AP, it must use the AP Reset Hold exit event in place of, for example, a HLT loop. First AP boot (first INIT-SIPI-SIPI sequence): Execute the AP (vCPU) as it was initialized and measured by the SEV-ES support. It is up to the guest to transfer control of the AP to the proper location. Subsequent AP boot: KVM will expect to receive an AP Reset Hold exit event indicating that the vCPU is being parked and will require an INIT-SIPI-SIPI sequence to awaken it. When the AP Reset Hold exit event is received, KVM will place the vCPU into a simulated HLT mode. Upon receiving the INIT-SIPI-SIPI sequence, KVM will make the vCPU runnable. It is again up to the guest to then transfer control of the AP to the proper location. To differentiate between an actual HLT and an AP Reset Hold, a new MP state is introduced, KVM_MP_STATE_AP_RESET_HOLD, which the vCPU is placed in upon receiving the AP Reset Hold exit event. Additionally, to communicate the AP Reset Hold exit event up to userspace (if needed), a new exit reason is introduced, KVM_EXIT_AP_RESET_HOLD. A new x86 ops function is introduced, vcpu_deliver_sipi_vector, in order to accomplish AP booting. For VMX, vcpu_deliver_sipi_vector is set to the original SIPI delivery function, kvm_vcpu_deliver_sipi_vector(). SVM adds a new function that, for non SEV-ES guests, invokes the original SIPI delivery function, kvm_vcpu_deliver_sipi_vector(), but for SEV-ES guests, implements the logic above. Signed-off-by: Tom Lendacky Message-Id: Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/svm/sev.c | 22 ++++++++++++++++++++++ arch/x86/kvm/svm/svm.c | 10 ++++++++++ arch/x86/kvm/svm/svm.h | 2 ++ arch/x86/kvm/vmx/vmx.c | 2 ++ arch/x86/kvm/x86.c | 26 +++++++++++++++++++++----- include/uapi/linux/kvm.h | 2 ++ 8 files changed, 63 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index afed3da3b3a0..3d6616f6f6ef 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1299,6 +1299,8 @@ struct kvm_x86_ops { void (*migrate_timers)(struct kvm_vcpu *vcpu); void (*msr_filter_changed)(struct kvm_vcpu *vcpu); int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err); + + void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector); }; struct kvm_x86_nested_ops { @@ -1480,6 +1482,7 @@ int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in); int kvm_emulate_cpuid(struct kvm_vcpu *vcpu); int kvm_emulate_halt(struct kvm_vcpu *vcpu); int kvm_vcpu_halt(struct kvm_vcpu *vcpu); +int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu); int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 78823227c592..43cceadd073e 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2898,7 +2898,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) /* evaluate pending_events before reading the vector */ smp_rmb(); sipi_vector = apic->sipi_vector; - kvm_vcpu_deliver_sipi_vector(vcpu, sipi_vector); + kvm_x86_ops.vcpu_deliver_sipi_vector(vcpu, sipi_vector); vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } } diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 563ced07b0b8..c8ffdbc81709 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1563,6 +1563,7 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) goto vmgexit_err; break; case SVM_VMGEXIT_NMI_COMPLETE: + case SVM_VMGEXIT_AP_HLT_LOOP: case SVM_VMGEXIT_AP_JUMP_TABLE: case SVM_VMGEXIT_UNSUPPORTED_EVENT: break; @@ -1888,6 +1889,9 @@ int sev_handle_vmgexit(struct vcpu_svm *svm) case SVM_VMGEXIT_NMI_COMPLETE: ret = svm_invoke_exit_handler(svm, SVM_EXIT_IRET); break; + case SVM_VMGEXIT_AP_HLT_LOOP: + ret = kvm_emulate_ap_reset_hold(&svm->vcpu); + break; case SVM_VMGEXIT_AP_JUMP_TABLE: { struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info; @@ -2040,3 +2044,21 @@ void sev_es_vcpu_put(struct vcpu_svm *svm) wrmsrl(host_save_user_msrs[i].index, svm->host_user_msrs[i]); } } + +void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + /* First SIPI: Use the values as initially set by the VMM */ + if (!svm->received_first_sipi) { + svm->received_first_sipi = true; + return; + } + + /* + * Subsequent SIPI: Return from an AP Reset Hold VMGEXIT, where + * the guest will set the CS and RIP. Set SW_EXIT_INFO_2 to a + * non-zero value. + */ + ghcb_set_sw_exit_info_2(svm->ghcb, 1); +} diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 6824d611dc5d..7ef171790d02 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4382,6 +4382,14 @@ static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu) (vmcb_is_intercept(&svm->vmcb->control, INTERCEPT_INIT)); } +static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) +{ + if (!sev_es_guest(vcpu->kvm)) + return kvm_vcpu_deliver_sipi_vector(vcpu, vector); + + sev_vcpu_deliver_sipi_vector(vcpu, vector); +} + static void svm_vm_destroy(struct kvm *kvm) { avic_vm_destroy(kvm); @@ -4524,6 +4532,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .msr_filter_changed = svm_msr_filter_changed, .complete_emulated_msr = svm_complete_emulated_msr, + + .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector, }; static struct kvm_x86_init_ops svm_init_ops __initdata = { diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 5431e6335e2e..0fe874ae5498 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -185,6 +185,7 @@ struct vcpu_svm { struct vmcb_save_area *vmsa; struct ghcb *ghcb; struct kvm_host_map ghcb_map; + bool received_first_sipi; /* SEV-ES scratch area support */ void *ghcb_sa; @@ -591,6 +592,7 @@ void sev_es_init_vmcb(struct vcpu_svm *svm); void sev_es_create_vcpu(struct vcpu_svm *svm); void sev_es_vcpu_load(struct vcpu_svm *svm, int cpu); void sev_es_vcpu_put(struct vcpu_svm *svm); +void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); /* vmenter.S */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 75c9c6a0a3a4..2af05d3b0590 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7707,6 +7707,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .msr_filter_changed = vmx_msr_filter_changed, .complete_emulated_msr = kvm_complete_insn_gp, .cpu_dirty_log_size = vmx_cpu_dirty_log_size, + + .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, }; static __init int hardware_setup(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f6e7b25c40e2..0287840b93e0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7976,17 +7976,22 @@ void kvm_arch_exit(void) kmem_cache_destroy(x86_fpu_cache); } -int kvm_vcpu_halt(struct kvm_vcpu *vcpu) +int __kvm_vcpu_halt(struct kvm_vcpu *vcpu, int state, int reason) { ++vcpu->stat.halt_exits; if (lapic_in_kernel(vcpu)) { - vcpu->arch.mp_state = KVM_MP_STATE_HALTED; + vcpu->arch.mp_state = state; return 1; } else { - vcpu->run->exit_reason = KVM_EXIT_HLT; + vcpu->run->exit_reason = reason; return 0; } } + +int kvm_vcpu_halt(struct kvm_vcpu *vcpu) +{ + return __kvm_vcpu_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT); +} EXPORT_SYMBOL_GPL(kvm_vcpu_halt); int kvm_emulate_halt(struct kvm_vcpu *vcpu) @@ -8000,6 +8005,14 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_halt); +int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu) +{ + int ret = kvm_skip_emulated_instruction(vcpu); + + return __kvm_vcpu_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD, KVM_EXIT_AP_RESET_HOLD) && ret; +} +EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold); + #ifdef CONFIG_X86_64 static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, unsigned long clock_type) @@ -9096,6 +9109,7 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) kvm_apic_accept_events(vcpu); switch(vcpu->arch.mp_state) { case KVM_MP_STATE_HALTED: + case KVM_MP_STATE_AP_RESET_HOLD: vcpu->arch.pv.pv_unhalted = false; vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -9522,8 +9536,9 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, kvm_load_guest_fpu(vcpu); kvm_apic_accept_events(vcpu); - if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED && - vcpu->arch.pv.pv_unhalted) + if ((vcpu->arch.mp_state == KVM_MP_STATE_HALTED || + vcpu->arch.mp_state == KVM_MP_STATE_AP_RESET_HOLD) && + vcpu->arch.pv.pv_unhalted) mp_state->mp_state = KVM_MP_STATE_RUNNABLE; else mp_state->mp_state = vcpu->arch.mp_state; @@ -10154,6 +10169,7 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) kvm_set_segment(vcpu, &cs, VCPU_SREG_CS); kvm_rip_write(vcpu, 0); } +EXPORT_SYMBOL_GPL(kvm_vcpu_deliver_sipi_vector); int kvm_arch_hardware_enable(void) { diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 886802b8ffba..374c67875cdb 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -251,6 +251,7 @@ struct kvm_hyperv_exit { #define KVM_EXIT_X86_RDMSR 29 #define KVM_EXIT_X86_WRMSR 30 #define KVM_EXIT_DIRTY_RING_FULL 31 +#define KVM_EXIT_AP_RESET_HOLD 32 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -573,6 +574,7 @@ struct kvm_vapic_addr { #define KVM_MP_STATE_CHECK_STOP 6 #define KVM_MP_STATE_OPERATING 7 #define KVM_MP_STATE_LOAD 8 +#define KVM_MP_STATE_AP_RESET_HOLD 9 struct kvm_mp_state { __u32 mp_state; From 717df0f4cdc9044c415431a3522b3e9ccca5b4a3 Mon Sep 17 00:00:00 2001 From: Ayush Sawal Date: Wed, 6 Jan 2021 09:59:06 +0530 Subject: [PATCH 505/547] chtls: Fix hardware tid leak send_abort_rpl() is not calculating cpl_abort_req_rss offset and ends up sending wrong TID with abort_rpl WR causng tid leaks. Replaced send_abort_rpl() with chtls_send_abort_rpl() as it is redundant. Fixes: cc35c88ae4db ("crypto : chtls - CPL handler definition") Signed-off-by: Rohit Maheshwari Signed-off-by: Ayush Sawal Signed-off-by: Jakub Kicinski --- .../chelsio/inline_crypto/chtls/chtls_cm.c | 39 ++----------------- 1 file changed, 3 insertions(+), 36 deletions(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c index a0e0d8a83681..561b5f2273af 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c @@ -1997,39 +1997,6 @@ static void t4_defer_reply(struct sk_buff *skb, struct chtls_dev *cdev, spin_unlock_bh(&cdev->deferq.lock); } -static void send_abort_rpl(struct sock *sk, struct sk_buff *skb, - struct chtls_dev *cdev, int status, int queue) -{ - struct cpl_abort_req_rss *req = cplhdr(skb); - struct sk_buff *reply_skb; - struct chtls_sock *csk; - - csk = rcu_dereference_sk_user_data(sk); - - reply_skb = alloc_skb(sizeof(struct cpl_abort_rpl), - GFP_KERNEL); - - if (!reply_skb) { - req->status = (queue << 1); - t4_defer_reply(skb, cdev, send_defer_abort_rpl); - return; - } - - set_abort_rpl_wr(reply_skb, GET_TID(req), status); - kfree_skb(skb); - - set_wr_txq(reply_skb, CPL_PRIORITY_DATA, queue); - if (csk_conn_inline(csk)) { - struct l2t_entry *e = csk->l2t_entry; - - if (e && sk->sk_state != TCP_SYN_RECV) { - cxgb4_l2t_send(csk->egress_dev, reply_skb, e); - return; - } - } - cxgb4_ofld_send(cdev->lldi->ports[0], reply_skb); -} - static void chtls_send_abort_rpl(struct sock *sk, struct sk_buff *skb, struct chtls_dev *cdev, int status, int queue) @@ -2079,8 +2046,8 @@ static void bl_abort_syn_rcv(struct sock *lsk, struct sk_buff *skb) skb->sk = NULL; do_abort_syn_rcv(child, lsk); - send_abort_rpl(child, skb, BLOG_SKB_CB(skb)->cdev, - CPL_ABORT_NO_RST, queue); + chtls_send_abort_rpl(child, skb, BLOG_SKB_CB(skb)->cdev, + CPL_ABORT_NO_RST, queue); } static int abort_syn_rcv(struct sock *sk, struct sk_buff *skb) @@ -2111,7 +2078,7 @@ static int abort_syn_rcv(struct sock *sk, struct sk_buff *skb) int queue = csk->txq_idx; do_abort_syn_rcv(sk, psk); - send_abort_rpl(sk, skb, cdev, CPL_ABORT_NO_RST, queue); + chtls_send_abort_rpl(sk, skb, cdev, CPL_ABORT_NO_RST, queue); } else { skb->sk = sk; BLOG_SKB_CB(skb)->backlog_rcv = bl_abort_syn_rcv; From 827d329105bfde6701f0077e34a09c4a86e27145 Mon Sep 17 00:00:00 2001 From: Ayush Sawal Date: Wed, 6 Jan 2021 09:59:07 +0530 Subject: [PATCH 506/547] chtls: Remove invalid set_tcb call At the time of SYN_RECV, connection information is not initialized at FW, updating tcb flag over uninitialized connection causes adapter crash. We don't need to update the flag during SYN_RECV state, so avoid this. Fixes: cc35c88ae4db ("crypto : chtls - CPL handler definition") Signed-off-by: Rohit Maheshwari Signed-off-by: Ayush Sawal Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c index 561b5f2273af..431d1e3844ab 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c @@ -2096,9 +2096,6 @@ static void chtls_abort_req_rss(struct sock *sk, struct sk_buff *skb) int queue = csk->txq_idx; if (is_neg_adv(req->status)) { - if (sk->sk_state == TCP_SYN_RECV) - chtls_set_tcb_tflag(sk, 0, 0); - kfree_skb(skb); return; } From 5a5fac9966bb6d513198634b0b1357be7e8447d2 Mon Sep 17 00:00:00 2001 From: Ayush Sawal Date: Wed, 6 Jan 2021 09:59:08 +0530 Subject: [PATCH 507/547] chtls: Fix panic when route to peer not configured If route to peer is not configured, we might get non tls devices from dst_neigh_lookup() which is invalid, adding a check to avoid it. Fixes: cc35c88ae4db ("crypto : chtls - CPL handler definition") Signed-off-by: Rohit Maheshwari Signed-off-by: Ayush Sawal Signed-off-by: Jakub Kicinski --- .../chelsio/inline_crypto/chtls/chtls_cm.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c index 431d1e3844ab..04a8bd5af3b9 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c @@ -1109,6 +1109,7 @@ static struct sock *chtls_recv_sock(struct sock *lsk, const struct cpl_pass_accept_req *req, struct chtls_dev *cdev) { + struct adapter *adap = pci_get_drvdata(cdev->pdev); struct neighbour *n = NULL; struct inet_sock *newinet; const struct iphdr *iph; @@ -1118,9 +1119,10 @@ static struct sock *chtls_recv_sock(struct sock *lsk, struct dst_entry *dst; struct tcp_sock *tp; struct sock *newsk; + bool found = false; u16 port_id; int rxq_idx; - int step; + int step, i; iph = (const struct iphdr *)network_hdr; newsk = tcp_create_openreq_child(lsk, oreq, cdev->askb); @@ -1152,7 +1154,7 @@ static struct sock *chtls_recv_sock(struct sock *lsk, n = dst_neigh_lookup(dst, &ip6h->saddr); #endif } - if (!n) + if (!n || !n->dev) goto free_sk; ndev = n->dev; @@ -1161,6 +1163,13 @@ static struct sock *chtls_recv_sock(struct sock *lsk, if (is_vlan_dev(ndev)) ndev = vlan_dev_real_dev(ndev); + for_each_port(adap, i) + if (cdev->ports[i] == ndev) + found = true; + + if (!found) + goto free_dst; + port_id = cxgb4_port_idx(ndev); csk = chtls_sock_create(cdev); @@ -1238,6 +1247,7 @@ static struct sock *chtls_recv_sock(struct sock *lsk, free_csk: chtls_sock_release(&csk->kref); free_dst: + neigh_release(n); dst_release(dst); free_sk: inet_csk_prepare_forced_close(newsk); From f8d15d29d6e6b32704c8fce9229716ca145a0de2 Mon Sep 17 00:00:00 2001 From: Ayush Sawal Date: Wed, 6 Jan 2021 09:59:09 +0530 Subject: [PATCH 508/547] chtls: Avoid unnecessary freeing of oreq pointer In chtls_pass_accept_request(), removing the chtls_reqsk_free() call to avoid oreq freeing twice. Here oreq is the pointer to struct request_sock. Fixes: cc35c88ae4db ("crypto : chtls - CPL handler definition") Signed-off-by: Rohit Maheshwari Signed-off-by: Ayush Sawal Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c index 04a8bd5af3b9..3022c802d09a 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c @@ -1397,7 +1397,7 @@ static void chtls_pass_accept_request(struct sock *sk, newsk = chtls_recv_sock(sk, oreq, network_hdr, req, cdev); if (!newsk) - goto free_oreq; + goto reject; if (chtls_get_module(newsk)) goto reject; @@ -1413,8 +1413,6 @@ static void chtls_pass_accept_request(struct sock *sk, kfree_skb(skb); return; -free_oreq: - chtls_reqsk_free(oreq); reject: mk_tid_release(reply_skb, 0, tid); cxgb4_ofld_send(cdev->lldi->ports[0], reply_skb); From a84b2c0d5fa23da6d6c8c0d5f5c93184a2744d3e Mon Sep 17 00:00:00 2001 From: Ayush Sawal Date: Wed, 6 Jan 2021 09:59:10 +0530 Subject: [PATCH 509/547] chtls: Replace skb_dequeue with skb_peek The skb is unlinked twice, one in __skb_dequeue in function chtls_reset_synq() and another in cleanup_syn_rcv_conn(). So in this patch using skb_peek() instead of __skb_dequeue(), so that unlink will be handled only in cleanup_syn_rcv_conn(). Fixes: cc35c88ae4db ("crypto : chtls - CPL handler definition") Signed-off-by: Vinay Kumar Yadav Signed-off-by: Ayush Sawal Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c index 3022c802d09a..ff3969a24d74 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c @@ -621,7 +621,7 @@ static void chtls_reset_synq(struct listen_ctx *listen_ctx) while (!skb_queue_empty(&listen_ctx->synq)) { struct chtls_sock *csk = - container_of((struct synq *)__skb_dequeue + container_of((struct synq *)skb_peek (&listen_ctx->synq), struct chtls_sock, synq); struct sock *child = csk->sk; From eade1e0a4fb31d48eeb1589d9bb859ae4dd6181d Mon Sep 17 00:00:00 2001 From: Ayush Sawal Date: Wed, 6 Jan 2021 09:59:11 +0530 Subject: [PATCH 510/547] chtls: Added a check to avoid NULL pointer dereference In case of server removal lookup_stid() may return NULL pointer, which is used as listen_ctx. So added a check before accessing this pointer. Fixes: cc35c88ae4db ("crypto : chtls - CPL handler definition") Signed-off-by: Vinay Kumar Yadav Signed-off-by: Ayush Sawal Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c index ff3969a24d74..1c6d3c93a0c8 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c @@ -1597,6 +1597,11 @@ static int chtls_pass_establish(struct chtls_dev *cdev, struct sk_buff *skb) sk_wake_async(sk, 0, POLL_OUT); data = lookup_stid(cdev->tids, stid); + if (!data) { + /* listening server close */ + kfree_skb(skb); + goto unlock; + } lsk = ((struct listen_ctx *)data)->lsk; bh_lock_sock(lsk); From 15ef6b0e30b354253e2c10b3836bc59767eb162b Mon Sep 17 00:00:00 2001 From: Ayush Sawal Date: Wed, 6 Jan 2021 09:59:12 +0530 Subject: [PATCH 511/547] chtls: Fix chtls resources release sequence CPL_ABORT_RPL is sent after releasing the resources by calling chtls_release_resources(sk); and chtls_conn_done(sk); eventually causing kernel panic. Fixing it by calling release in appropriate order. Fixes: cc35c88ae4db ("crypto : chtls - CPL handler definition") Signed-off-by: Vinay Kumar Yadav Signed-off-by: Ayush Sawal Signed-off-by: Jakub Kicinski --- .../net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c index 1c6d3c93a0c8..51dd030b3b36 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c @@ -2058,9 +2058,9 @@ static void bl_abort_syn_rcv(struct sock *lsk, struct sk_buff *skb) queue = csk->txq_idx; skb->sk = NULL; - do_abort_syn_rcv(child, lsk); chtls_send_abort_rpl(child, skb, BLOG_SKB_CB(skb)->cdev, CPL_ABORT_NO_RST, queue); + do_abort_syn_rcv(child, lsk); } static int abort_syn_rcv(struct sock *sk, struct sk_buff *skb) @@ -2090,8 +2090,8 @@ static int abort_syn_rcv(struct sock *sk, struct sk_buff *skb) if (!sock_owned_by_user(psk)) { int queue = csk->txq_idx; - do_abort_syn_rcv(sk, psk); chtls_send_abort_rpl(sk, skb, cdev, CPL_ABORT_NO_RST, queue); + do_abort_syn_rcv(sk, psk); } else { skb->sk = sk; BLOG_SKB_CB(skb)->backlog_rcv = bl_abort_syn_rcv; @@ -2135,12 +2135,12 @@ static void chtls_abort_req_rss(struct sock *sk, struct sk_buff *skb) if (sk->sk_state == TCP_SYN_RECV && !abort_syn_rcv(sk, skb)) return; - chtls_release_resources(sk); - chtls_conn_done(sk); } chtls_send_abort_rpl(sk, skb, BLOG_SKB_CB(skb)->cdev, rst_status, queue); + chtls_release_resources(sk); + chtls_conn_done(sk); } static void chtls_abort_rpl_rss(struct sock *sk, struct sk_buff *skb) From ac7996d680d8b4a51bb99bbdcee3dc838b985498 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 7 Jan 2021 12:39:16 +0000 Subject: [PATCH 512/547] octeontx2-af: fix memory leak of lmac and lmac->name Currently the error return paths don't kfree lmac and lmac->name leading to some memory leaks. Fix this by adding two error return paths that kfree these objects Addresses-Coverity: ("Resource leak") Fixes: 1463f382f58d ("octeontx2-af: Add support for CGX link management") Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20210107123916.189748-1-colin.king@canonical.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/af/cgx.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c index 7d0f96290943..1a8f5a039d50 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c @@ -871,8 +871,10 @@ static int cgx_lmac_init(struct cgx *cgx) if (!lmac) return -ENOMEM; lmac->name = kcalloc(1, sizeof("cgx_fwi_xxx_yyy"), GFP_KERNEL); - if (!lmac->name) - return -ENOMEM; + if (!lmac->name) { + err = -ENOMEM; + goto err_lmac_free; + } sprintf(lmac->name, "cgx_fwi_%d_%d", cgx->cgx_id, i); lmac->lmac_id = i; lmac->cgx = cgx; @@ -883,7 +885,7 @@ static int cgx_lmac_init(struct cgx *cgx) CGX_LMAC_FWI + i * 9), cgx_fwi_event_handler, 0, lmac->name, lmac); if (err) - return err; + goto err_irq; /* Enable interrupt */ cgx_write(cgx, lmac->lmac_id, CGXX_CMRX_INT_ENA_W1S, @@ -895,6 +897,12 @@ static int cgx_lmac_init(struct cgx *cgx) } return cgx_lmac_verify_fwi_version(cgx); + +err_irq: + kfree(lmac->name); +err_lmac_free: + kfree(lmac); + return err; } static int cgx_lmac_exit(struct cgx *cgx) From 07e61a979ca4dddb3661f59328b3cd109f6b0070 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 7 Jan 2021 16:48:21 +0200 Subject: [PATCH 513/547] nexthop: Fix off-by-one error in error path A reference was not taken for the current nexthop entry, so do not try to put it in the error path. Fixes: 430a049190de ("nexthop: Add support for nexthop groups") Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- net/ipv4/nexthop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 5e1b22d4f939..f8035cfa9c20 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -1459,7 +1459,7 @@ static struct nexthop *nexthop_create_group(struct net *net, return nh; out_no_nh: - for (; i >= 0; --i) + for (i--; i >= 0; --i) nexthop_put(nhg->nh_entries[i].nh); kfree(nhg->spare); From 7b01e53eee6dce7a8a6736e06b99b68cd0cc7a27 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 7 Jan 2021 16:48:22 +0200 Subject: [PATCH 514/547] nexthop: Unlink nexthop group entry in error path In case of error, remove the nexthop group entry from the list to which it was previously added. Fixes: 430a049190de ("nexthop: Add support for nexthop groups") Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- net/ipv4/nexthop.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index f8035cfa9c20..712cdc061cde 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -1459,8 +1459,10 @@ static struct nexthop *nexthop_create_group(struct net *net, return nh; out_no_nh: - for (i--; i >= 0; --i) + for (i--; i >= 0; --i) { + list_del(&nhg->nh_entries[i].nh_list); nexthop_put(nhg->nh_entries[i].nh); + } kfree(nhg->spare); kfree(nhg); From b19218b27f3477316d296e8bcf4446aaf017aa69 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 7 Jan 2021 16:48:23 +0200 Subject: [PATCH 515/547] nexthop: Bounce NHA_GATEWAY in FDB nexthop groups The function nh_check_attr_group() is called to validate nexthop groups. The intention of that code seems to have been to bounce all attributes above NHA_GROUP_TYPE except for NHA_FDB. However instead it bounces all these attributes except when NHA_FDB attribute is present--then it accepts them. NHA_FDB validation that takes place before, in rtm_to_nh_config(), already bounces NHA_OIF, NHA_BLACKHOLE, NHA_ENCAP and NHA_ENCAP_TYPE. Yet further back, NHA_GROUPS and NHA_MASTER are bounced unconditionally. But that still leaves NHA_GATEWAY as an attribute that would be accepted in FDB nexthop groups (with no meaning), so long as it keeps the address family as unspecified: # ip nexthop add id 1 fdb via 127.0.0.1 # ip nexthop add id 10 fdb via default group 1 The nexthop code is still relatively new and likely not used very broadly, and the FDB bits are newer still. Even though there is a reproducer out there, it relies on an improbable gateway arguments "via default", "via all" or "via any". Given all this, I believe it is OK to reformulate the condition to do the right thing and bounce NHA_GATEWAY. Fixes: 38428d68719c ("nexthop: support for fdb ecmp nexthops") Signed-off-by: Petr Machata Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- net/ipv4/nexthop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 712cdc061cde..e53e43aef785 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -627,7 +627,7 @@ static int nh_check_attr_group(struct net *net, struct nlattr *tb[], for (i = NHA_GROUP_TYPE + 1; i < __NHA_MAX; ++i) { if (!tb[i]) continue; - if (tb[NHA_FDB]) + if (i == NHA_FDB) continue; NL_SET_ERR_MSG(extack, "No other attributes can be set in nexthop groups"); From a5c9ca76a1c61fb5e4c35de8eb25aa925b03c9e4 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 7 Jan 2021 16:48:24 +0200 Subject: [PATCH 516/547] selftests: fib_nexthops: Fix wrong mausezahn invocation For IPv6 traffic, mausezahn needs to be invoked with '-6'. Otherwise an error is returned: # ip netns exec me mausezahn veth1 -B 2001:db8:101::2 -A 2001:db8:91::1 -c 0 -t tcp "dp=1-1023, flags=syn" Failed to set source IPv4 address. Please check if source is set to a valid IPv4 address. Invalid command line parameters! Fixes: 7c741868ceab ("selftests: Add torture tests to nexthop tests") Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fib_nexthops.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index eb693a3b7b4a..4c7d33618437 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -869,7 +869,7 @@ ipv6_torture() pid3=$! ip netns exec me ping -f 2001:db8:101::2 >/dev/null 2>&1 & pid4=$! - ip netns exec me mausezahn veth1 -B 2001:db8:101::2 -A 2001:db8:91::1 -c 0 -t tcp "dp=1-1023, flags=syn" >/dev/null 2>&1 & + ip netns exec me mausezahn -6 veth1 -B 2001:db8:101::2 -A 2001:db8:91::1 -c 0 -t tcp "dp=1-1023, flags=syn" >/dev/null 2>&1 & pid5=$! sleep 300 From 0b9902c1fcc59ba75268386c0420a554f8844168 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Thu, 7 Jan 2021 18:24:40 +0100 Subject: [PATCH 517/547] s390/qeth: fix deadlock during recovery When qeth_dev_layer2_store() - holding the discipline_mutex - waits inside qeth_l*_remove_device() for a qeth_do_reset() thread to complete, we can hit a deadlock if qeth_do_reset() concurrently calls qeth_set_online() and thus tries to aquire the discipline_mutex. Move the discipline_mutex locking outside of qeth_set_online() and qeth_set_offline(), and turn the discipline into a parameter so that callers understand the dependency. To fix the deadlock, we can now relax the locking: As already established, qeth_l*_remove_device() waits for qeth_do_reset() to complete. So qeth_do_reset() itself is under no risk of having card->discipline ripped out while it's running, and thus doesn't need to take the discipline_mutex. Fixes: 9dc48ccc68b9 ("qeth: serialize sysfs-triggered device configurations") Signed-off-by: Julian Wiedmann Reviewed-by: Alexandra Winter Signed-off-by: Jakub Kicinski --- drivers/s390/net/qeth_core.h | 3 ++- drivers/s390/net/qeth_core_main.c | 35 +++++++++++++++++++------------ drivers/s390/net/qeth_l2_main.c | 7 +++++-- drivers/s390/net/qeth_l3_main.c | 7 +++++-- 4 files changed, 34 insertions(+), 18 deletions(-) diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h index 6f5ddc3eab8c..28f637042d44 100644 --- a/drivers/s390/net/qeth_core.h +++ b/drivers/s390/net/qeth_core.h @@ -1079,7 +1079,8 @@ struct qeth_card *qeth_get_card_by_busid(char *bus_id); void qeth_set_allowed_threads(struct qeth_card *card, unsigned long threads, int clear_start_mask); int qeth_threads_running(struct qeth_card *, unsigned long); -int qeth_set_offline(struct qeth_card *card, bool resetting); +int qeth_set_offline(struct qeth_card *card, const struct qeth_discipline *disc, + bool resetting); int qeth_send_ipa_cmd(struct qeth_card *, struct qeth_cmd_buffer *, int (*reply_cb) diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index f4b60294a969..d45e223fc521 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -5507,12 +5507,12 @@ out: return rc; } -static int qeth_set_online(struct qeth_card *card) +static int qeth_set_online(struct qeth_card *card, + const struct qeth_discipline *disc) { bool carrier_ok; int rc; - mutex_lock(&card->discipline_mutex); mutex_lock(&card->conf_mutex); QETH_CARD_TEXT(card, 2, "setonlin"); @@ -5529,7 +5529,7 @@ static int qeth_set_online(struct qeth_card *card) /* no need for locking / error handling at this early stage: */ qeth_set_real_num_tx_queues(card, qeth_tx_actual_queues(card)); - rc = card->discipline->set_online(card, carrier_ok); + rc = disc->set_online(card, carrier_ok); if (rc) goto err_online; @@ -5537,7 +5537,6 @@ static int qeth_set_online(struct qeth_card *card) kobject_uevent(&card->gdev->dev.kobj, KOBJ_CHANGE); mutex_unlock(&card->conf_mutex); - mutex_unlock(&card->discipline_mutex); return 0; err_online: @@ -5552,15 +5551,14 @@ err_hardsetup: qdio_free(CARD_DDEV(card)); mutex_unlock(&card->conf_mutex); - mutex_unlock(&card->discipline_mutex); return rc; } -int qeth_set_offline(struct qeth_card *card, bool resetting) +int qeth_set_offline(struct qeth_card *card, const struct qeth_discipline *disc, + bool resetting) { int rc, rc2, rc3; - mutex_lock(&card->discipline_mutex); mutex_lock(&card->conf_mutex); QETH_CARD_TEXT(card, 3, "setoffl"); @@ -5581,7 +5579,7 @@ int qeth_set_offline(struct qeth_card *card, bool resetting) cancel_work_sync(&card->rx_mode_work); - card->discipline->set_offline(card); + disc->set_offline(card); qeth_qdio_clear_card(card, 0); qeth_drain_output_queues(card); @@ -5602,16 +5600,19 @@ int qeth_set_offline(struct qeth_card *card, bool resetting) kobject_uevent(&card->gdev->dev.kobj, KOBJ_CHANGE); mutex_unlock(&card->conf_mutex); - mutex_unlock(&card->discipline_mutex); return 0; } EXPORT_SYMBOL_GPL(qeth_set_offline); static int qeth_do_reset(void *data) { + const struct qeth_discipline *disc; struct qeth_card *card = data; int rc; + /* Lock-free, other users will block until we are done. */ + disc = card->discipline; + QETH_CARD_TEXT(card, 2, "recover1"); if (!qeth_do_run_thread(card, QETH_RECOVER_THREAD)) return 0; @@ -5619,8 +5620,8 @@ static int qeth_do_reset(void *data) dev_warn(&card->gdev->dev, "A recovery process has been started for the device\n"); - qeth_set_offline(card, true); - rc = qeth_set_online(card); + qeth_set_offline(card, disc, true); + rc = qeth_set_online(card, disc); if (!rc) { dev_info(&card->gdev->dev, "Device successfully recovered!\n"); @@ -6647,7 +6648,10 @@ static int qeth_core_set_online(struct ccwgroup_device *gdev) } } - rc = qeth_set_online(card); + mutex_lock(&card->discipline_mutex); + rc = qeth_set_online(card, card->discipline); + mutex_unlock(&card->discipline_mutex); + err: return rc; } @@ -6655,8 +6659,13 @@ err: static int qeth_core_set_offline(struct ccwgroup_device *gdev) { struct qeth_card *card = dev_get_drvdata(&gdev->dev); + int rc; - return qeth_set_offline(card, false); + mutex_lock(&card->discipline_mutex); + rc = qeth_set_offline(card, card->discipline, false); + mutex_unlock(&card->discipline_mutex); + + return rc; } static void qeth_core_shutdown(struct ccwgroup_device *gdev) diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index 4ed0fb0705a5..37279b1e29f6 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ -2207,8 +2207,11 @@ static void qeth_l2_remove_device(struct ccwgroup_device *gdev) qeth_set_allowed_threads(card, 0, 1); wait_event(card->wait_q, qeth_threads_running(card, 0xffffffff) == 0); - if (gdev->state == CCWGROUP_ONLINE) - qeth_set_offline(card, false); + if (gdev->state == CCWGROUP_ONLINE) { + mutex_lock(&card->discipline_mutex); + qeth_set_offline(card, card->discipline, false); + mutex_unlock(&card->discipline_mutex); + } cancel_work_sync(&card->close_dev_work); if (card->dev->reg_state == NETREG_REGISTERED) diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index d138ac432d01..8d474179ce98 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -1970,8 +1970,11 @@ static void qeth_l3_remove_device(struct ccwgroup_device *cgdev) qeth_set_allowed_threads(card, 0, 1); wait_event(card->wait_q, qeth_threads_running(card, 0xffffffff) == 0); - if (cgdev->state == CCWGROUP_ONLINE) - qeth_set_offline(card, false); + if (cgdev->state == CCWGROUP_ONLINE) { + mutex_lock(&card->discipline_mutex); + qeth_set_offline(card, card->discipline, false); + mutex_unlock(&card->discipline_mutex); + } cancel_work_sync(&card->close_dev_work); if (card->dev->reg_state == NETREG_REGISTERED) From b41b554c1ee75070a14c02a88496b1f231c7eacc Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Thu, 7 Jan 2021 18:24:41 +0100 Subject: [PATCH 518/547] s390/qeth: fix locking for discipline setup / removal Due to insufficient locking, qeth_core_set_online() and qeth_dev_layer2_store() can run in parallel, both attempting to load & setup the discipline (and stepping on each other toes along the way). A similar race can also occur between qeth_core_remove_device() and qeth_dev_layer2_store(). Access to .discipline is meant to be protected by the discipline_mutex, so add/expand the locking in qeth_core_remove_device() and qeth_core_set_online(). Adjust the locking in qeth_l*_remove_device() accordingly, as it's now handled by the callers in a consistent manner. Based on an initial patch by Ursula Braun. Fixes: 9dc48ccc68b9 ("qeth: serialize sysfs-triggered device configurations") Signed-off-by: Julian Wiedmann Reviewed-by: Alexandra Winter Signed-off-by: Jakub Kicinski --- drivers/s390/net/qeth_core_main.c | 7 +++++-- drivers/s390/net/qeth_l2_main.c | 5 +---- drivers/s390/net/qeth_l3_main.c | 5 +---- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index d45e223fc521..cf18d87da41e 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -6585,6 +6585,7 @@ static int qeth_core_probe_device(struct ccwgroup_device *gdev) break; default: card->info.layer_enforced = true; + /* It's so early that we don't need the discipline_mutex yet. */ rc = qeth_core_load_discipline(card, enforced_disc); if (rc) goto err_load; @@ -6617,10 +6618,12 @@ static void qeth_core_remove_device(struct ccwgroup_device *gdev) QETH_CARD_TEXT(card, 2, "removedv"); + mutex_lock(&card->discipline_mutex); if (card->discipline) { card->discipline->remove(gdev); qeth_core_free_discipline(card); } + mutex_unlock(&card->discipline_mutex); qeth_free_qdio_queues(card); @@ -6635,6 +6638,7 @@ static int qeth_core_set_online(struct ccwgroup_device *gdev) int rc = 0; enum qeth_discipline_id def_discipline; + mutex_lock(&card->discipline_mutex); if (!card->discipline) { def_discipline = IS_IQD(card) ? QETH_DISCIPLINE_LAYER3 : QETH_DISCIPLINE_LAYER2; @@ -6648,11 +6652,10 @@ static int qeth_core_set_online(struct ccwgroup_device *gdev) } } - mutex_lock(&card->discipline_mutex); rc = qeth_set_online(card, card->discipline); - mutex_unlock(&card->discipline_mutex); err: + mutex_unlock(&card->discipline_mutex); return rc; } diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index 37279b1e29f6..4254caf1d9b6 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ -2207,11 +2207,8 @@ static void qeth_l2_remove_device(struct ccwgroup_device *gdev) qeth_set_allowed_threads(card, 0, 1); wait_event(card->wait_q, qeth_threads_running(card, 0xffffffff) == 0); - if (gdev->state == CCWGROUP_ONLINE) { - mutex_lock(&card->discipline_mutex); + if (gdev->state == CCWGROUP_ONLINE) qeth_set_offline(card, card->discipline, false); - mutex_unlock(&card->discipline_mutex); - } cancel_work_sync(&card->close_dev_work); if (card->dev->reg_state == NETREG_REGISTERED) diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index 8d474179ce98..6970597bc885 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -1970,11 +1970,8 @@ static void qeth_l3_remove_device(struct ccwgroup_device *cgdev) qeth_set_allowed_threads(card, 0, 1); wait_event(card->wait_q, qeth_threads_running(card, 0xffffffff) == 0); - if (cgdev->state == CCWGROUP_ONLINE) { - mutex_lock(&card->discipline_mutex); + if (cgdev->state == CCWGROUP_ONLINE) qeth_set_offline(card, card->discipline, false); - mutex_unlock(&card->discipline_mutex); - } cancel_work_sync(&card->close_dev_work); if (card->dev->reg_state == NETREG_REGISTERED) From f9c4845385c8f6631ebd5dddfb019ea7a285fba4 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Thu, 7 Jan 2021 18:24:42 +0100 Subject: [PATCH 519/547] s390/qeth: fix L2 header access in qeth_l3_osa_features_check() ip_finish_output_gso() may call .ndo_features_check() even before the skb has a L2 header. This conflicts with qeth_get_ip_version()'s attempt to inspect the L2 header via vlan_eth_hdr(). Switch to vlan_get_protocol(), as already used further down in the common qeth_features_check() path. Fixes: f13ade199391 ("s390/qeth: run non-offload L3 traffic over common xmit path") Signed-off-by: Julian Wiedmann Signed-off-by: Jakub Kicinski --- drivers/s390/net/qeth_l3_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index 6970597bc885..4c2cae7ae9a7 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -1813,7 +1813,7 @@ static netdev_features_t qeth_l3_osa_features_check(struct sk_buff *skb, struct net_device *dev, netdev_features_t features) { - if (qeth_get_ip_version(skb) != 4) + if (vlan_get_protocol(skb) != htons(ETH_P_IP)) features &= ~NETIF_F_HW_VLAN_CTAG_TX; return qeth_features_check(skb, dev, features); } From 3545454c7801e391b0d966f82c98614d45394770 Mon Sep 17 00:00:00 2001 From: Aleksander Jan Bajkowski Date: Thu, 7 Jan 2021 20:58:18 +0100 Subject: [PATCH 520/547] net: dsa: lantiq_gswip: Exclude RMII from modes that report 1 GbE Exclude RMII from modes that report 1 GbE support. Reduced MII supports up to 100 MbE. Fixes: 14fceff4771e ("net: dsa: Add Lantiq / Intel DSA driver for vrx200") Signed-off-by: Aleksander Jan Bajkowski Reviewed-by: Florian Fainelli Link: https://lore.kernel.org/r/20210107195818.3878-1-olek2@wp.pl Signed-off-by: Jakub Kicinski --- drivers/net/dsa/lantiq_gswip.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/dsa/lantiq_gswip.c b/drivers/net/dsa/lantiq_gswip.c index 4b36d89bec06..662e68a0e7e6 100644 --- a/drivers/net/dsa/lantiq_gswip.c +++ b/drivers/net/dsa/lantiq_gswip.c @@ -1436,11 +1436,12 @@ static void gswip_phylink_validate(struct dsa_switch *ds, int port, phylink_set(mask, Pause); phylink_set(mask, Asym_Pause); - /* With the exclusion of MII and Reverse MII, we support Gigabit, - * including Half duplex + /* With the exclusion of MII, Reverse MII and Reduced MII, we + * support Gigabit, including Half duplex */ if (state->interface != PHY_INTERFACE_MODE_MII && - state->interface != PHY_INTERFACE_MODE_REVMII) { + state->interface != PHY_INTERFACE_MODE_REVMII && + state->interface != PHY_INTERFACE_MODE_RMII) { phylink_set(mask, 1000baseT_Full); phylink_set(mask, 1000baseT_Half); } From 2d2f6f1b4799428d160c021dd652bc3e3593945e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Jan 2021 19:36:40 +0100 Subject: [PATCH 521/547] block: pre-initialize struct block_device in bdev_alloc_inode bdev_evict_inode and bdev_free_inode are also called for the root inode of bdevfs, for which bdev_alloc is never called. Move the zeroing o f struct block_device and the initialization of the bd_bdi field into bdev_alloc_inode to make sure they are initialized for the root inode as well. Fixes: e6cb53827ed6 ("block: initialize struct block_device in bdev_alloc") Reported-by: Alexey Kardashevskiy Tested-by: Alexey Kardashevskiy Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/block_dev.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index e454c5a81043..3b8963e228a1 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -776,8 +776,11 @@ static struct kmem_cache * bdev_cachep __read_mostly; static struct inode *bdev_alloc_inode(struct super_block *sb) { struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL); + if (!ei) return NULL; + memset(&ei->bdev, 0, sizeof(ei->bdev)); + ei->bdev.bd_bdi = &noop_backing_dev_info; return &ei->vfs_inode; } @@ -871,14 +874,12 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) mapping_set_gfp_mask(&inode->i_data, GFP_USER); bdev = I_BDEV(inode); - memset(bdev, 0, sizeof(*bdev)); mutex_init(&bdev->bd_mutex); mutex_init(&bdev->bd_fsfreeze_mutex); spin_lock_init(&bdev->bd_size_lock); bdev->bd_disk = disk; bdev->bd_partno = partno; bdev->bd_inode = inode; - bdev->bd_bdi = &noop_backing_dev_info; #ifdef CONFIG_SYSFS INIT_LIST_HEAD(&bdev->bd_holder_disks); #endif From ae28d1aae48a1258bd09a6f707ebb4231d79a761 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 17 Dec 2020 14:31:18 -0800 Subject: [PATCH 522/547] x86/resctrl: Use an IPI instead of task_work_add() to update PQR_ASSOC MSR Currently, when moving a task to a resource group the PQR_ASSOC MSR is updated with the new closid and rmid in an added task callback. If the task is running, the work is run as soon as possible. If the task is not running, the work is executed later in the kernel exit path when the kernel returns to the task again. Updating the PQR_ASSOC MSR as soon as possible on the CPU a moved task is running is the right thing to do. Queueing work for a task that is not running is unnecessary (the PQR_ASSOC MSR is already updated when the task is scheduled in) and causing system resource waste with the way in which it is implemented: Work to update the PQR_ASSOC register is queued every time the user writes a task id to the "tasks" file, even if the task already belongs to the resource group. This could result in multiple pending work items associated with a single task even if they are all identical and even though only a single update with most recent values is needed. Specifically, even if a task is moved between different resource groups while it is sleeping then it is only the last move that is relevant but yet a work item is queued during each move. This unnecessary queueing of work items could result in significant system resource waste, especially on tasks sleeping for a long time. For example, as demonstrated by Shakeel Butt in [1] writing the same task id to the "tasks" file can quickly consume significant memory. The same problem (wasted system resources) occurs when moving a task between different resource groups. As pointed out by Valentin Schneider in [2] there is an additional issue with the way in which the queueing of work is done in that the task_struct update is currently done after the work is queued, resulting in a race with the register update possibly done before the data needed by the update is available. To solve these issues, update the PQR_ASSOC MSR in a synchronous way right after the new closid and rmid are ready during the task movement, only if the task is running. If a moved task is not running nothing is done since the PQR_ASSOC MSR will be updated next time the task is scheduled. This is the same way used to update the register when tasks are moved as part of resource group removal. [1] https://lore.kernel.org/lkml/CALvZod7E9zzHwenzf7objzGKsdBmVwTgEJ0nPgs0LUFU3SN5Pw@mail.gmail.com/ [2] https://lore.kernel.org/lkml/20201123022433.17905-1-valentin.schneider@arm.com [ bp: Massage commit message and drop the two update_task_closid_rmid() variants. ] Fixes: e02737d5b826 ("x86/intel_rdt: Add tasks files") Reported-by: Shakeel Butt Reported-by: Valentin Schneider Signed-off-by: Fenghua Yu Signed-off-by: Reinette Chatre Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Reviewed-by: James Morse Reviewed-by: Valentin Schneider Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/17aa2fb38fc12ce7bb710106b3e7c7b45acb9e94.1608243147.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 112 ++++++++++--------------- 1 file changed, 43 insertions(+), 69 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 29ffb95b25ff..1c6f8a60ac52 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -525,89 +525,63 @@ static void rdtgroup_remove(struct rdtgroup *rdtgrp) kfree(rdtgrp); } -struct task_move_callback { - struct callback_head work; - struct rdtgroup *rdtgrp; -}; - -static void move_myself(struct callback_head *head) +static void _update_task_closid_rmid(void *task) { - struct task_move_callback *callback; - struct rdtgroup *rdtgrp; - - callback = container_of(head, struct task_move_callback, work); - rdtgrp = callback->rdtgrp; - /* - * If resource group was deleted before this task work callback - * was invoked, then assign the task to root group and free the - * resource group. + * If the task is still current on this CPU, update PQR_ASSOC MSR. + * Otherwise, the MSR is updated when the task is scheduled in. */ - if (atomic_dec_and_test(&rdtgrp->waitcount) && - (rdtgrp->flags & RDT_DELETED)) { - current->closid = 0; - current->rmid = 0; - rdtgroup_remove(rdtgrp); - } + if (task == current) + resctrl_sched_in(); +} - if (unlikely(current->flags & PF_EXITING)) - goto out; - - preempt_disable(); - /* update PQR_ASSOC MSR to make resource group go into effect */ - resctrl_sched_in(); - preempt_enable(); - -out: - kfree(callback); +static void update_task_closid_rmid(struct task_struct *t) +{ + if (IS_ENABLED(CONFIG_SMP) && task_curr(t)) + smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1); + else + _update_task_closid_rmid(t); } static int __rdtgroup_move_task(struct task_struct *tsk, struct rdtgroup *rdtgrp) { - struct task_move_callback *callback; - int ret; - - callback = kzalloc(sizeof(*callback), GFP_KERNEL); - if (!callback) - return -ENOMEM; - callback->work.func = move_myself; - callback->rdtgrp = rdtgrp; - /* - * Take a refcount, so rdtgrp cannot be freed before the - * callback has been invoked. + * Set the task's closid/rmid before the PQR_ASSOC MSR can be + * updated by them. + * + * For ctrl_mon groups, move both closid and rmid. + * For monitor groups, can move the tasks only from + * their parent CTRL group. */ - atomic_inc(&rdtgrp->waitcount); - ret = task_work_add(tsk, &callback->work, TWA_RESUME); - if (ret) { - /* - * Task is exiting. Drop the refcount and free the callback. - * No need to check the refcount as the group cannot be - * deleted before the write function unlocks rdtgroup_mutex. - */ - atomic_dec(&rdtgrp->waitcount); - kfree(callback); - rdt_last_cmd_puts("Task exited\n"); - } else { - /* - * For ctrl_mon groups move both closid and rmid. - * For monitor groups, can move the tasks only from - * their parent CTRL group. - */ - if (rdtgrp->type == RDTCTRL_GROUP) { - tsk->closid = rdtgrp->closid; + + if (rdtgrp->type == RDTCTRL_GROUP) { + tsk->closid = rdtgrp->closid; + tsk->rmid = rdtgrp->mon.rmid; + } else if (rdtgrp->type == RDTMON_GROUP) { + if (rdtgrp->mon.parent->closid == tsk->closid) { tsk->rmid = rdtgrp->mon.rmid; - } else if (rdtgrp->type == RDTMON_GROUP) { - if (rdtgrp->mon.parent->closid == tsk->closid) { - tsk->rmid = rdtgrp->mon.rmid; - } else { - rdt_last_cmd_puts("Can't move task to different control group\n"); - ret = -EINVAL; - } + } else { + rdt_last_cmd_puts("Can't move task to different control group\n"); + return -EINVAL; } } - return ret; + + /* + * Ensure the task's closid and rmid are written before determining if + * the task is current that will decide if it will be interrupted. + */ + barrier(); + + /* + * By now, the task's closid and rmid are set. If the task is current + * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource + * group go into effect. If the task is not current, the MSR will be + * updated when the task is scheduled in. + */ + update_task_closid_rmid(tsk); + + return 0; } static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) From a0195f314a25582b38993bf30db11c300f4f4611 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 17 Dec 2020 14:31:19 -0800 Subject: [PATCH 523/547] x86/resctrl: Don't move a task to the same resource group Shakeel Butt reported in [1] that a user can request a task to be moved to a resource group even if the task is already in the group. It just wastes time to do the move operation which could be costly to send IPI to a different CPU. Add a sanity check to ensure that the move operation only happens when the task is not already in the resource group. [1] https://lore.kernel.org/lkml/CALvZod7E9zzHwenzf7objzGKsdBmVwTgEJ0nPgs0LUFU3SN5Pw@mail.gmail.com/ Fixes: e02737d5b826 ("x86/intel_rdt: Add tasks files") Reported-by: Shakeel Butt Signed-off-by: Fenghua Yu Signed-off-by: Reinette Chatre Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/962ede65d8e95be793cb61102cca37f7bb018e66.1608243147.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 1c6f8a60ac52..460f3e0df106 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -546,6 +546,13 @@ static void update_task_closid_rmid(struct task_struct *t) static int __rdtgroup_move_task(struct task_struct *tsk, struct rdtgroup *rdtgrp) { + /* If the task is already in rdtgrp, no need to move the task. */ + if ((rdtgrp->type == RDTCTRL_GROUP && tsk->closid == rdtgrp->closid && + tsk->rmid == rdtgrp->mon.rmid) || + (rdtgrp->type == RDTMON_GROUP && tsk->rmid == rdtgrp->mon.rmid && + tsk->closid == rdtgrp->mon.parent->closid)) + return 0; + /* * Set the task's closid/rmid before the PQR_ASSOC MSR can be * updated by them. From 872f36eb0b0f4f0e3a81ea1e51a6bdf58ccfdc6e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 8 Jan 2021 05:54:44 -0500 Subject: [PATCH 524/547] KVM: x86: __kvm_vcpu_halt can be static Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0287840b93e0..a480804ae27a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7976,7 +7976,7 @@ void kvm_arch_exit(void) kmem_cache_destroy(x86_fpu_cache); } -int __kvm_vcpu_halt(struct kvm_vcpu *vcpu, int state, int reason) +static int __kvm_vcpu_halt(struct kvm_vcpu *vcpu, int state, int reason) { ++vcpu->stat.halt_exits; if (lapic_in_kernel(vcpu)) { From 74acfa996b2aec2a4ea8587104c7e2f8d4c6aec2 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Fri, 8 Jan 2021 15:36:30 +0100 Subject: [PATCH 525/547] block/rnbd: Select SG_POOL for RNBD_CLIENT lkp reboot following build error: drivers/block/rnbd/rnbd-clt.c: In function 'rnbd_softirq_done_fn': >> drivers/block/rnbd/rnbd-clt.c:387:2: error: implicit declaration of function 'sg_free_table_chained' [-Werror=implicit-function-declaration] 387 | sg_free_table_chained(&iu->sgt, RNBD_INLINE_SG_CNT); | ^~~~~~~~~~~~~~~~~~~~~ The reason is CONFIG_SG_POOL is not enabled in the config, to avoid such failure, select SG_POOL in Kconfig for RNBD_CLIENT. Fixes: 5a1328d0c3a7 ("block/rnbd-clt: Dynamically allocate sglist for rnbd_iu") Reported-by: kernel test robot Signed-off-by: Jack Wang Signed-off-by: Jens Axboe --- drivers/block/rnbd/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/rnbd/Kconfig b/drivers/block/rnbd/Kconfig index 4b6d3d816d1f..2ff05a0d2646 100644 --- a/drivers/block/rnbd/Kconfig +++ b/drivers/block/rnbd/Kconfig @@ -7,6 +7,7 @@ config BLK_DEV_RNBD_CLIENT tristate "RDMA Network Block Device driver client" depends on INFINIBAND_RTRS_CLIENT select BLK_DEV_RNBD + select SG_POOL help RNBD client is a network block device driver using rdma transport. From 1a84e7c629f8f288e02236bc799f9b0be1cab4a7 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Fri, 8 Jan 2021 15:36:31 +0100 Subject: [PATCH 526/547] block/rnbd-srv: Fix use after free in rnbd_srv_sess_dev_force_close KASAN detect following BUG: [ 778.215311] ================================================================== [ 778.216696] BUG: KASAN: use-after-free in rnbd_srv_sess_dev_force_close+0x38/0x60 [rnbd_server] [ 778.219037] Read of size 8 at addr ffff88b1d6516c28 by task tee/8842 [ 778.220500] CPU: 37 PID: 8842 Comm: tee Kdump: loaded Not tainted 5.10.0-pserver #5.10.0-1+feature+linux+next+20201214.1025+0910d71 [ 778.220529] Hardware name: Supermicro Super Server/X11DDW-L, BIOS 3.3 02/21/2020 [ 778.220555] Call Trace: [ 778.220609] dump_stack+0x99/0xcb [ 778.220667] ? rnbd_srv_sess_dev_force_close+0x38/0x60 [rnbd_server] [ 778.220715] print_address_description.constprop.7+0x1e/0x230 [ 778.220750] ? freeze_kernel_threads+0x73/0x73 [ 778.220896] ? rnbd_srv_sess_dev_force_close+0x38/0x60 [rnbd_server] [ 778.220932] ? rnbd_srv_sess_dev_force_close+0x38/0x60 [rnbd_server] [ 778.220994] kasan_report.cold.9+0x37/0x7c [ 778.221066] ? kobject_put+0x80/0x270 [ 778.221102] ? rnbd_srv_sess_dev_force_close+0x38/0x60 [rnbd_server] [ 778.221184] rnbd_srv_sess_dev_force_close+0x38/0x60 [rnbd_server] [ 778.221240] rnbd_srv_dev_session_force_close_store+0x6a/0xc0 [rnbd_server] [ 778.221304] ? sysfs_file_ops+0x90/0x90 [ 778.221353] kernfs_fop_write+0x141/0x240 [ 778.221451] vfs_write+0x142/0x4d0 [ 778.221553] ksys_write+0xc0/0x160 [ 778.221602] ? __ia32_sys_read+0x50/0x50 [ 778.221684] ? lockdep_hardirqs_on_prepare+0x13d/0x210 [ 778.221718] ? syscall_enter_from_user_mode+0x1c/0x50 [ 778.221821] do_syscall_64+0x33/0x40 [ 778.221862] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 778.221896] RIP: 0033:0x7f4affdd9504 [ 778.221928] Code: 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b3 0f 1f 80 00 00 00 00 48 8d 05 f9 61 0d 00 8b 00 85 c0 75 13 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 54 c3 0f 1f 00 41 54 49 89 d4 55 48 89 f5 53 [ 778.221956] RSP: 002b:00007fffebb36b28 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [ 778.222011] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007f4affdd9504 [ 778.222038] RDX: 0000000000000002 RSI: 00007fffebb36c50 RDI: 0000000000000003 [ 778.222066] RBP: 00007fffebb36c50 R08: 0000556a151aa600 R09: 00007f4affeb1540 [ 778.222094] R10: fffffffffffffc19 R11: 0000000000000246 R12: 0000556a151aa520 [ 778.222121] R13: 0000000000000002 R14: 00007f4affea6760 R15: 0000000000000002 [ 778.222764] Allocated by task 3212: [ 778.223285] kasan_save_stack+0x19/0x40 [ 778.223316] __kasan_kmalloc.constprop.7+0xc1/0xd0 [ 778.223347] kmem_cache_alloc_trace+0x186/0x350 [ 778.223382] rnbd_srv_rdma_ev+0xf16/0x1690 [rnbd_server] [ 778.223422] process_io_req+0x4d1/0x670 [rtrs_server] [ 778.223573] __ib_process_cq+0x10a/0x350 [ib_core] [ 778.223709] ib_cq_poll_work+0x31/0xb0 [ib_core] [ 778.223743] process_one_work+0x521/0xa90 [ 778.223773] worker_thread+0x65/0x5b0 [ 778.223802] kthread+0x1f2/0x210 [ 778.223833] ret_from_fork+0x22/0x30 [ 778.224296] Freed by task 8842: [ 778.224800] kasan_save_stack+0x19/0x40 [ 778.224829] kasan_set_track+0x1c/0x30 [ 778.224860] kasan_set_free_info+0x1b/0x30 [ 778.224889] __kasan_slab_free+0x108/0x150 [ 778.224919] slab_free_freelist_hook+0x64/0x190 [ 778.224947] kfree+0xe2/0x650 [ 778.224982] rnbd_destroy_sess_dev+0x2fa/0x3b0 [rnbd_server] [ 778.225011] kobject_put+0xda/0x270 [ 778.225046] rnbd_srv_sess_dev_force_close+0x30/0x60 [rnbd_server] [ 778.225081] rnbd_srv_dev_session_force_close_store+0x6a/0xc0 [rnbd_server] [ 778.225111] kernfs_fop_write+0x141/0x240 [ 778.225140] vfs_write+0x142/0x4d0 [ 778.225169] ksys_write+0xc0/0x160 [ 778.225198] do_syscall_64+0x33/0x40 [ 778.225227] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 778.226506] The buggy address belongs to the object at ffff88b1d6516c00 which belongs to the cache kmalloc-512 of size 512 [ 778.227464] The buggy address is located 40 bytes inside of 512-byte region [ffff88b1d6516c00, ffff88b1d6516e00) The problem is in the sess_dev release function we call rnbd_destroy_sess_dev, and could free the sess_dev already, but we still set the keep_id in rnbd_srv_sess_dev_force_close, which lead to use after free. To fix it, move the keep_id before the sysfs removal, and cache the rnbd_srv_session for lock accessing, Fixes: 786998050cbc ("block/rnbd-srv: close a mapped device from server side.") Signed-off-by: Jack Wang Reviewed-by: Guoqing Jiang Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-srv.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index b8e44331e494..a6a68d44f517 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -338,10 +338,12 @@ static int rnbd_srv_link_ev(struct rtrs_srv *rtrs, void rnbd_srv_sess_dev_force_close(struct rnbd_srv_sess_dev *sess_dev) { - mutex_lock(&sess_dev->sess->lock); - rnbd_srv_destroy_dev_session_sysfs(sess_dev); - mutex_unlock(&sess_dev->sess->lock); + struct rnbd_srv_session *sess = sess_dev->sess; + sess_dev->keep_id = true; + mutex_lock(&sess->lock); + rnbd_srv_destroy_dev_session_sysfs(sess_dev); + mutex_unlock(&sess->lock); } static int process_msg_close(struct rtrs_srv *rtrs, From 80f99093d81370c5cec37fca3b5a6bdf6bddf0f6 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 8 Jan 2021 15:36:32 +0100 Subject: [PATCH 527/547] block/rnbd-clt: Fix sg table use after free Since dynamically allocate sglist is used for rnbd_iu, we can't free sg table after send_usr_msg since the callback function (cqe.done) could still access the sglist. Otherwise KASAN reports UAF issue: [ 4856.600257] BUG: KASAN: use-after-free in dma_direct_unmap_sg+0x53/0x290 [ 4856.600772] Read of size 4 at addr ffff888206af3a98 by task swapper/1/0 [ 4856.601729] CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G W 5.10.0-pserver #5.10.0-1+feature+linux+next+20201214.1025+0910d71 [ 4856.601748] Hardware name: Supermicro Super Server/X11DDW-L, BIOS 3.3 02/21/2020 [ 4856.601766] Call Trace: [ 4856.601785] [ 4856.601822] dump_stack+0x99/0xcb [ 4856.601856] ? dma_direct_unmap_sg+0x53/0x290 [ 4856.601888] print_address_description.constprop.7+0x1e/0x230 [ 4856.601913] ? freeze_kernel_threads+0x73/0x73 [ 4856.601965] ? mark_held_locks+0x29/0xa0 [ 4856.602019] ? dma_direct_unmap_sg+0x53/0x290 [ 4856.602039] ? dma_direct_unmap_sg+0x53/0x290 [ 4856.602079] kasan_report.cold.9+0x37/0x7c [ 4856.602188] ? mlx5_ib_post_recv+0x430/0x520 [mlx5_ib] [ 4856.602209] ? dma_direct_unmap_sg+0x53/0x290 [ 4856.602256] dma_direct_unmap_sg+0x53/0x290 [ 4856.602366] complete_rdma_req+0x188/0x4b0 [rtrs_client] [ 4856.602451] ? rtrs_clt_close+0x80/0x80 [rtrs_client] [ 4856.602535] ? mlx5_ib_poll_cq+0x48b/0x16e0 [mlx5_ib] [ 4856.602589] ? radix_tree_insert+0x3a0/0x3a0 [ 4856.602610] ? do_raw_spin_lock+0x119/0x1d0 [ 4856.602647] ? rwlock_bug.part.1+0x60/0x60 [ 4856.602740] rtrs_clt_rdma_done+0x3f7/0x670 [rtrs_client] [ 4856.602804] ? rtrs_clt_rdma_cm_handler+0xda0/0xda0 [rtrs_client] [ 4856.602857] ? check_flags.part.31+0x6c/0x1f0 [ 4856.602927] ? rcu_read_lock_sched_held+0xaf/0xe0 [ 4856.602963] ? rcu_read_lock_bh_held+0xc0/0xc0 [ 4856.603137] __ib_process_cq+0x10a/0x350 [ib_core] [ 4856.603309] ib_poll_handler+0x41/0x1c0 [ib_core] [ 4856.603358] irq_poll_softirq+0xe6/0x280 [ 4856.603392] ? lockdep_hardirqs_on_prepare+0x111/0x210 [ 4856.603446] __do_softirq+0x10d/0x646 [ 4856.603540] asm_call_irq_on_stack+0x12/0x20 [ 4856.603563] [ 4856.605096] Allocated by task 8914: [ 4856.605510] kasan_save_stack+0x19/0x40 [ 4856.605532] __kasan_kmalloc.constprop.7+0xc1/0xd0 [ 4856.605552] __kmalloc+0x155/0x320 [ 4856.605574] __sg_alloc_table+0x155/0x1c0 [ 4856.605594] sg_alloc_table+0x1f/0x50 [ 4856.605620] send_msg_sess_info+0x119/0x2e0 [rnbd_client] [ 4856.605646] remap_devs+0x71/0x210 [rnbd_client] [ 4856.605676] init_sess+0xad8/0xe10 [rtrs_client] [ 4856.605706] rtrs_clt_reconnect_work+0xd6/0x170 [rtrs_client] [ 4856.605728] process_one_work+0x521/0xa90 [ 4856.605748] worker_thread+0x65/0x5b0 [ 4856.605769] kthread+0x1f2/0x210 [ 4856.605789] ret_from_fork+0x22/0x30 [ 4856.606159] Freed by task 8914: [ 4856.606559] kasan_save_stack+0x19/0x40 [ 4856.606580] kasan_set_track+0x1c/0x30 [ 4856.606601] kasan_set_free_info+0x1b/0x30 [ 4856.606622] __kasan_slab_free+0x108/0x150 [ 4856.606642] slab_free_freelist_hook+0x64/0x190 [ 4856.606661] kfree+0xe2/0x650 [ 4856.606681] __sg_free_table+0xa4/0x100 [ 4856.606707] send_msg_sess_info+0x1d6/0x2e0 [rnbd_client] [ 4856.606733] remap_devs+0x71/0x210 [rnbd_client] [ 4856.606763] init_sess+0xad8/0xe10 [rtrs_client] [ 4856.606792] rtrs_clt_reconnect_work+0xd6/0x170 [rtrs_client] [ 4856.606813] process_one_work+0x521/0xa90 [ 4856.606833] worker_thread+0x65/0x5b0 [ 4856.606853] kthread+0x1f2/0x210 [ 4856.606872] ret_from_fork+0x22/0x30 The solution is to free iu's sgtable after the iu is not used anymore. And also move sg_alloc_table into rnbd_get_iu accordingly. Fixes: 5a1328d0c3a7 ("block/rnbd-clt: Dynamically allocate sglist for rnbd_iu") Signed-off-by: Guoqing Jiang Signed-off-by: Jack Wang Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 96e3f9fe8241..506e9094487f 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -375,12 +375,19 @@ static struct rnbd_iu *rnbd_get_iu(struct rnbd_clt_session *sess, init_waitqueue_head(&iu->comp.wait); iu->comp.errno = INT_MAX; + if (sg_alloc_table(&iu->sgt, 1, GFP_KERNEL)) { + rnbd_put_permit(sess, permit); + kfree(iu); + return NULL; + } + return iu; } static void rnbd_put_iu(struct rnbd_clt_session *sess, struct rnbd_iu *iu) { if (atomic_dec_and_test(&iu->refcount)) { + sg_free_table(&iu->sgt); rnbd_put_permit(sess, iu->permit); kfree(iu); } @@ -487,8 +494,6 @@ static int send_msg_close(struct rnbd_clt_dev *dev, u32 device_id, bool wait) iu->buf = NULL; iu->dev = dev; - sg_alloc_table(&iu->sgt, 1, GFP_KERNEL); - msg.hdr.type = cpu_to_le16(RNBD_MSG_CLOSE); msg.device_id = cpu_to_le32(device_id); @@ -502,7 +507,6 @@ static int send_msg_close(struct rnbd_clt_dev *dev, u32 device_id, bool wait) err = errno; } - sg_free_table(&iu->sgt); rnbd_put_iu(sess, iu); return err; } @@ -575,7 +579,6 @@ static int send_msg_open(struct rnbd_clt_dev *dev, bool wait) iu->buf = rsp; iu->dev = dev; - sg_alloc_table(&iu->sgt, 1, GFP_KERNEL); sg_init_one(iu->sgt.sgl, rsp, sizeof(*rsp)); msg.hdr.type = cpu_to_le16(RNBD_MSG_OPEN); @@ -594,7 +597,6 @@ static int send_msg_open(struct rnbd_clt_dev *dev, bool wait) err = errno; } - sg_free_table(&iu->sgt); rnbd_put_iu(sess, iu); return err; } @@ -622,8 +624,6 @@ static int send_msg_sess_info(struct rnbd_clt_session *sess, bool wait) iu->buf = rsp; iu->sess = sess; - - sg_alloc_table(&iu->sgt, 1, GFP_KERNEL); sg_init_one(iu->sgt.sgl, rsp, sizeof(*rsp)); msg.hdr.type = cpu_to_le16(RNBD_MSG_SESS_INFO); @@ -650,7 +650,6 @@ put_iu: } else { err = errno; } - sg_free_table(&iu->sgt); rnbd_put_iu(sess, iu); return err; } From ef8048dd2345d070c41bc7df16763fd4d8fac296 Mon Sep 17 00:00:00 2001 From: Swapnil Ingle Date: Fri, 8 Jan 2021 15:36:33 +0100 Subject: [PATCH 528/547] block/rnbd: Adding name to the Contributors List Adding name to the Contributors List Signed-off-by: Swapnil Ingle Acked-by: Jack Wang Acked-by: Danil Kipnis Signed-off-by: Jack Wang Signed-off-by: Jens Axboe --- drivers/block/rnbd/README | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/rnbd/README b/drivers/block/rnbd/README index 1773c0aa0bd4..080f58a5400a 100644 --- a/drivers/block/rnbd/README +++ b/drivers/block/rnbd/README @@ -90,3 +90,4 @@ Kleber Souza Lutz Pogrell Milind Dumbare Roman Penyaev +Swapnil Ingle From 3a21777c6ee99749bac10727b3c17e5bcfebe5c1 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Fri, 8 Jan 2021 15:36:34 +0100 Subject: [PATCH 529/547] block/rnbd-clt: avoid module unload race with close confirmation We had kernel panic, it is caused by unload module and last close confirmation. call trace: [1196029.743127] free_sess+0x15/0x50 [rtrs_client] [1196029.743128] rtrs_clt_close+0x4c/0x70 [rtrs_client] [1196029.743129] ? rnbd_clt_unmap_device+0x1b0/0x1b0 [rnbd_client] [1196029.743130] close_rtrs+0x25/0x50 [rnbd_client] [1196029.743131] rnbd_client_exit+0x93/0xb99 [rnbd_client] [1196029.743132] __x64_sys_delete_module+0x190/0x260 And in the crashdump confirmation kworker is also running. PID: 6943 TASK: ffff9e2ac8098000 CPU: 4 COMMAND: "kworker/4:2" #0 [ffffb206cf337c30] __schedule at ffffffff9f93f891 #1 [ffffb206cf337cc8] schedule at ffffffff9f93fe98 #2 [ffffb206cf337cd0] schedule_timeout at ffffffff9f943938 #3 [ffffb206cf337d50] wait_for_completion at ffffffff9f9410a7 #4 [ffffb206cf337da0] __flush_work at ffffffff9f08ce0e #5 [ffffb206cf337e20] rtrs_clt_close_conns at ffffffffc0d5f668 [rtrs_client] #6 [ffffb206cf337e48] rtrs_clt_close at ffffffffc0d5f801 [rtrs_client] #7 [ffffb206cf337e68] close_rtrs at ffffffffc0d26255 [rnbd_client] #8 [ffffb206cf337e78] free_sess at ffffffffc0d262ad [rnbd_client] #9 [ffffb206cf337e88] rnbd_clt_put_dev at ffffffffc0d266a7 [rnbd_client] The problem is both code path try to close same session, which lead to panic. To fix it, just skip the sess if the refcount already drop to 0. Fixes: f7a7a5c228d4 ("block/rnbd: client: main functionality") Signed-off-by: Jack Wang Reviewed-by: Gioh Kim Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 506e9094487f..45a470076652 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1697,7 +1697,8 @@ static void rnbd_destroy_sessions(void) */ list_for_each_entry_safe(sess, sn, &sess_list, list) { - WARN_ON(!rnbd_clt_get_sess(sess)); + if (!rnbd_clt_get_sess(sess)) + continue; close_rtrs(sess); list_for_each_entry_safe(dev, tn, &sess->devs_list, list) { /* From 02f938e9fed1681791605ca8b96c2d9da9355f6a Mon Sep 17 00:00:00 2001 From: John Garry Date: Fri, 8 Jan 2021 16:55:37 +0800 Subject: [PATCH 530/547] blk-mq-debugfs: Add decode for BLK_MQ_F_TAG_HCTX_SHARED Showing the hctx flags for when BLK_MQ_F_TAG_HCTX_SHARED is set gives something like: root@debian:/home/john# more /sys/kernel/debug/block/sda/hctx0/flags alloc_policy=FIFO SHOULD_MERGE|TAG_QUEUE_SHARED|3 Add the decoding for that flag. Fixes: 32bc15afed04b ("blk-mq: Facilitate a shared sbitmap per tagset") Signed-off-by: John Garry Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index e21eed20a155..712a95f74ccb 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -246,6 +246,7 @@ static const char *const hctx_flag_name[] = { HCTX_FLAG_NAME(BLOCKING), HCTX_FLAG_NAME(NO_SCHED), HCTX_FLAG_NAME(STACKING), + HCTX_FLAG_NAME(TAG_HCTX_SHARED), }; #undef HCTX_FLAG_NAME From bac717171971176b78c72d15a8b6961764ab197f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 30 Dec 2020 16:20:05 +0100 Subject: [PATCH 531/547] ARM: picoxcell: fix missing interrupt-parent properties dtc points out that the interrupts for some devices are not parsable: picoxcell-pc3x2.dtsi:45.19-49.5: Warning (interrupts_property): /paxi/gem@30000: Missing interrupt-parent picoxcell-pc3x2.dtsi:51.21-55.5: Warning (interrupts_property): /paxi/dmac@40000: Missing interrupt-parent picoxcell-pc3x2.dtsi:57.21-61.5: Warning (interrupts_property): /paxi/dmac@50000: Missing interrupt-parent picoxcell-pc3x2.dtsi:233.21-237.5: Warning (interrupts_property): /rwid-axi/axi2pico@c0000000: Missing interrupt-parent There are two VIC instances, so it's not clear which one needs to be used. I found the BSP sources that reference VIC0, so use that: https://github.com/r1mikey/meta-picoxcell/blob/master/recipes-kernel/linux/linux-picochip-3.0/0001-picoxcell-support-for-Picochip-picoXcell-SoC.patch Acked-by: Jamie Iles Link: https://lore.kernel.org/r/20201230152010.3914962-1-arnd@kernel.org' Signed-off-by: Arnd Bergmann --- arch/arm/boot/dts/picoxcell-pc3x2.dtsi | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm/boot/dts/picoxcell-pc3x2.dtsi b/arch/arm/boot/dts/picoxcell-pc3x2.dtsi index c4c6c7e9e37b..5898879a3038 100644 --- a/arch/arm/boot/dts/picoxcell-pc3x2.dtsi +++ b/arch/arm/boot/dts/picoxcell-pc3x2.dtsi @@ -45,18 +45,21 @@ emac: gem@30000 { compatible = "cadence,gem"; reg = <0x30000 0x10000>; + interrupt-parent = <&vic0>; interrupts = <31>; }; dmac1: dmac@40000 { compatible = "snps,dw-dmac"; reg = <0x40000 0x10000>; + interrupt-parent = <&vic0>; interrupts = <25>; }; dmac2: dmac@50000 { compatible = "snps,dw-dmac"; reg = <0x50000 0x10000>; + interrupt-parent = <&vic0>; interrupts = <26>; }; @@ -233,6 +236,7 @@ axi2pico@c0000000 { compatible = "picochip,axi2pico-pc3x2"; reg = <0xc0000000 0x10000>; + interrupt-parent = <&vic0>; interrupts = <13 14 15 16 17 18 19 20 21>; }; }; From 84e261553e6f919bf0b4d65244599ab2b41f1da5 Mon Sep 17 00:00:00 2001 From: David Arcari Date: Thu, 7 Jan 2021 09:47:07 -0500 Subject: [PATCH 532/547] hwmon: (amd_energy) fix allocation of hwmon_channel_info config hwmon, specifically hwmon_num_channel_attrs, expects the config array in the hwmon_channel_info structure to be terminated by a zero entry. amd_energy does not honor this convention. As result, a KASAN warning is possible. Fix this by adding an additional entry and setting it to zero. Fixes: 8abee9566b7e ("hwmon: Add amd_energy driver to report energy counters") Signed-off-by: David Arcari Cc: Naveen Krishna Chatradhi Cc: Jean Delvare Cc: Guenter Roeck Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org Signed-off-by: David Arcari Acked-by: Naveen Krishna Chatradhi Link: https://lore.kernel.org/r/20210107144707.6927-1-darcari@redhat.com Signed-off-by: Guenter Roeck --- drivers/hwmon/amd_energy.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/hwmon/amd_energy.c b/drivers/hwmon/amd_energy.c index 9b306448b7a0..822c2e74b98d 100644 --- a/drivers/hwmon/amd_energy.c +++ b/drivers/hwmon/amd_energy.c @@ -222,7 +222,7 @@ static int amd_create_sensor(struct device *dev, */ cpus = num_present_cpus() / num_siblings; - s_config = devm_kcalloc(dev, cpus + sockets, + s_config = devm_kcalloc(dev, cpus + sockets + 1, sizeof(u32), GFP_KERNEL); if (!s_config) return -ENOMEM; @@ -254,6 +254,7 @@ static int amd_create_sensor(struct device *dev, scnprintf(label_l[i], 10, "Esocket%u", (i - cpus)); } + s_config[i] = 0; return 0; } From a91bd6223ecd46addc71ee6fcd432206d39365d2 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 8 Jan 2021 12:48:47 +0100 Subject: [PATCH 533/547] Revert "init/console: Use ttynull as a fallback when there is no console" This reverts commit 757055ae8dedf5333af17b3b5b4b70ba9bc9da4e. The commit caused that ttynull was used as the default console on several systems[1][2][3]. As a result, the console was blank even when a better alternative existed. It happened when there was no console configured on the command line and ttynull_init() was the first initcall calling register_console(). Or it happened when /dev/ did not exist when console_on_rootfs() was called. It was not able to open /dev/console even though a console driver was registered. It tried to add ttynull console but it obviously did not help. But ttynull became the preferred console and was used by /dev/console when it was available later. The commit tried to fix a historical problem that have been there for ages. The primary motivation was the commit 3cffa06aeef7ece30f6 ("printk/console: Allow to disable console output by using console="" or console=null"). It provided a clean solution for a workaround that was widely used and worked only by chance. This revert causes that the console="" or console=null command line options will again work only by chance. These options will cause that a particular console will be preferred and the default (tty) ones will not get enabled. There will be no console registered at all. As a result there won't be stdin, stdout, and stderr for the init process. But it worked exactly this way even before. The proper solution has to fulfill many conditions: + Register ttynull only when explicitly required or as the ultimate fallback. + ttynull should get associated with /dev/console but it must not become preferred console when used as a fallback. Especially, it must still be possible to replace it by a better console later. Such a change requires clean up of the register_console() code. Otherwise, it would be even harder to follow. Especially, the use of has_preferred_console and CON_CONSDEV flag is tricky. The clean up is risky. The ordering of consoles is not well defined. And any changes tend to break existing user settings. Do the revert at the least risky solution for now. [1] https://lore.kernel.org/linux-kselftest/20201221144302.GR4077@smile.fi.intel.com/ [2] https://lore.kernel.org/lkml/d2a3b3c0-e548-7dd1-730f-59bc5c04e191@synopsys.com/ [3] https://patchwork.ozlabs.org/project/linux-um/patch/20210105120128.10854-1-thomas@m3y3r.de/ Reported-by: Andy Shevchenko Reported-by: Vineet Gupta Reported-by: Thomas Meyer Signed-off-by: Petr Mladek Acked-by: Greg Kroah-Hartman Acked-by: Sergey Senozhatsky Signed-off-by: Linus Torvalds --- drivers/tty/Kconfig | 14 ++++++++++++++ drivers/tty/Makefile | 3 ++- drivers/tty/ttynull.c | 18 ------------------ include/linux/console.h | 3 --- init/main.c | 10 ++-------- 5 files changed, 18 insertions(+), 30 deletions(-) diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig index 47a6e42f0d04..e15cd6b5bb99 100644 --- a/drivers/tty/Kconfig +++ b/drivers/tty/Kconfig @@ -401,6 +401,20 @@ config MIPS_EJTAG_FDC_KGDB_CHAN help FDC channel number to use for KGDB. +config NULL_TTY + tristate "NULL TTY driver" + help + Say Y here if you want a NULL TTY which simply discards messages. + + This is useful to allow userspace applications which expect a console + device to work without modifications even when no console is + available or desired. + + In order to use this driver, you should redirect the console to this + TTY, or boot the kernel with console=ttynull. + + If unsure, say N. + config TRACE_ROUTER tristate "Trace data router for MIPI P1149.7 cJTAG standard" depends on TRACE_SINK diff --git a/drivers/tty/Makefile b/drivers/tty/Makefile index 3c1c5a9240a7..b3ccae932660 100644 --- a/drivers/tty/Makefile +++ b/drivers/tty/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_TTY) += tty_io.o n_tty.o tty_ioctl.o tty_ldisc.o \ tty_buffer.o tty_port.o tty_mutex.o \ tty_ldsem.o tty_baudrate.o tty_jobctrl.o \ - n_null.o ttynull.o + n_null.o obj-$(CONFIG_LEGACY_PTYS) += pty.o obj-$(CONFIG_UNIX98_PTYS) += pty.o obj-$(CONFIG_AUDIT) += tty_audit.o @@ -25,6 +25,7 @@ obj-$(CONFIG_ISI) += isicom.o obj-$(CONFIG_MOXA_INTELLIO) += moxa.o obj-$(CONFIG_MOXA_SMARTIO) += mxser.o obj-$(CONFIG_NOZOMI) += nozomi.o +obj-$(CONFIG_NULL_TTY) += ttynull.o obj-$(CONFIG_ROCKETPORT) += rocket.o obj-$(CONFIG_SYNCLINK_GT) += synclink_gt.o obj-$(CONFIG_PPC_EPAPR_HV_BYTECHAN) += ehv_bytechan.o diff --git a/drivers/tty/ttynull.c b/drivers/tty/ttynull.c index eced70ec54e1..17f05b7eb6d3 100644 --- a/drivers/tty/ttynull.c +++ b/drivers/tty/ttynull.c @@ -2,13 +2,6 @@ /* * Copyright (C) 2019 Axis Communications AB * - * The console is useful for userspace applications which expect a console - * device to work without modifications even when no console is available - * or desired. - * - * In order to use this driver, you should redirect the console to this - * TTY, or boot the kernel with console=ttynull. - * * Based on ttyprintk.c: * Copyright (C) 2010 Samo Pogacnik */ @@ -66,17 +59,6 @@ static struct console ttynull_console = { .device = ttynull_device, }; -void __init register_ttynull_console(void) -{ - if (!ttynull_driver) - return; - - if (add_preferred_console(ttynull_console.name, 0, NULL)) - return; - - register_console(&ttynull_console); -} - static int __init ttynull_init(void) { struct tty_driver *driver; diff --git a/include/linux/console.h b/include/linux/console.h index dbe78e8e2602..20874db50bc8 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -186,12 +186,9 @@ extern int braille_register_console(struct console *, int index, extern int braille_unregister_console(struct console *); #ifdef CONFIG_TTY extern void console_sysfs_notify(void); -extern void register_ttynull_console(void); #else static inline void console_sysfs_notify(void) { } -static inline void register_ttynull_console(void) -{ } #endif extern bool console_suspend_enabled; diff --git a/init/main.c b/init/main.c index 421640fca375..c68d784376ca 100644 --- a/init/main.c +++ b/init/main.c @@ -1480,14 +1480,8 @@ void __init console_on_rootfs(void) struct file *file = filp_open("/dev/console", O_RDWR, 0); if (IS_ERR(file)) { - pr_err("Warning: unable to open an initial console. Fallback to ttynull.\n"); - register_ttynull_console(); - - file = filp_open("/dev/console", O_RDWR, 0); - if (IS_ERR(file)) { - pr_err("Warning: Failed to add ttynull console. No stdin, stdout, and stderr for the init process!\n"); - return; - } + pr_err("Warning: unable to open an initial console.\n"); + return; } init_dup(file); init_dup(file); From ef0ba05538299f1391cbe097de36895bb36ecfe6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 7 Jan 2021 09:43:54 -0800 Subject: [PATCH 534/547] poll: fix performance regression due to out-of-line __put_user() The kernel test robot reported a -5.8% performance regression on the "poll2" test of will-it-scale, and bisected it to commit d55564cfc222 ("x86: Make __put_user() generate an out-of-line call"). I didn't expect an out-of-line __put_user() to matter, because no normal core code should use that non-checking legacy version of user access any more. But I had overlooked the very odd poll() usage, which does a __put_user() to update the 'revents' values of the poll array. Now, Al Viro correctly points out that instead of updating just the 'revents' field, it would be much simpler to just copy the _whole_ pollfd entry, and then we could just use "copy_to_user()" on the whole array of entries, the same way we use "copy_from_user()" a few lines earlier to get the original values. But that is not what we've traditionally done, and I worry that threaded applications might be concurrently modifying the other fields of the pollfd array. So while Al's suggestion is simpler - and perhaps worth trying in the future - this instead keeps the "just update revents" model. To fix the performance regression, use the modern "unsafe_put_user()" instead of __put_user(), with the proper "user_write_access_begin()" guarding in place. This improves code generation enormously. Link: https://lore.kernel.org/lkml/20210107134723.GA28532@xsang-OptiPlex-9020/ Reported-by: kernel test robot Tested-by: Oliver Sang Cc: Al Viro Cc: David Laight Cc: Peter Zijlstra Signed-off-by: Linus Torvalds --- fs/select.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/fs/select.c b/fs/select.c index ebfebdfe5c69..37aaa8317f3a 100644 --- a/fs/select.c +++ b/fs/select.c @@ -1011,14 +1011,17 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, fdcount = do_poll(head, &table, end_time); poll_freewait(&table); + if (!user_write_access_begin(ufds, nfds * sizeof(*ufds))) + goto out_fds; + for (walk = head; walk; walk = walk->next) { struct pollfd *fds = walk->entries; int j; - for (j = 0; j < walk->len; j++, ufds++) - if (__put_user(fds[j].revents, &ufds->revents)) - goto out_fds; + for (j = walk->len; j; fds++, ufds++, j--) + unsafe_put_user(fds->revents, &ufds->revents, Efault); } + user_write_access_end(); err = fdcount; out_fds: @@ -1030,6 +1033,11 @@ out_fds: } return err; + +Efault: + user_write_access_end(); + err = -EFAULT; + goto out_fds; } static long do_restart_poll(struct restart_block *restart_block) From e8deee4f1543eda9b75278f63322f412cad52f6a Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Fri, 8 Jan 2021 13:46:55 -0800 Subject: [PATCH 535/547] ARC: [hsdk]: Enable FPU_SAVE_RESTORE HSDK has hardware floating point and the common use case is with glibc+hf so enable that as default. Signed-off-by: Vineet Gupta --- arch/arc/plat-hsdk/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arc/plat-hsdk/Kconfig b/arch/arc/plat-hsdk/Kconfig index 6b5c54576f54..a2d10c29fbcc 100644 --- a/arch/arc/plat-hsdk/Kconfig +++ b/arch/arc/plat-hsdk/Kconfig @@ -7,6 +7,7 @@ menuconfig ARC_SOC_HSDK depends on ISA_ARCV2 select ARC_HAS_ACCL_REGS select ARC_IRQ_NO_AUTOSAVE + select ARC_FPU_SAVE_RESTORE select CLK_HSDK select RESET_CONTROLLER select RESET_HSDK From e80927079fd97b4d5457e3af2400a0087b561564 Mon Sep 17 00:00:00 2001 From: Yi Li Date: Mon, 4 Jan 2021 15:41:18 +0800 Subject: [PATCH 536/547] bcache: set pdev_set_uuid before scond loop iteration There is no need to reassign pdev_set_uuid in the second loop iteration, so move it to the place before second loop. Signed-off-by: Yi Li Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index a4752ac410dc..6aa23a6fb394 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2644,8 +2644,8 @@ static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, } list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) { + char *pdev_set_uuid = pdev->dc->sb.set_uuid; list_for_each_entry_safe(c, tc, &bch_cache_sets, list) { - char *pdev_set_uuid = pdev->dc->sb.set_uuid; char *set_uuid = c->set_uuid; if (!memcmp(pdev_set_uuid, set_uuid, 16)) { From f7b4943dea48a572ad751ce1f18a245d43debe7e Mon Sep 17 00:00:00 2001 From: Coly Li Date: Mon, 4 Jan 2021 15:41:19 +0800 Subject: [PATCH 537/547] bcache: fix typo from SUUP to SUPP in features.h This patch fixes the following typos, from BCH_FEATURE_COMPAT_SUUP to BCH_FEATURE_COMPAT_SUPP from BCH_FEATURE_INCOMPAT_SUUP to BCH_FEATURE_INCOMPAT_SUPP from BCH_FEATURE_INCOMPAT_SUUP to BCH_FEATURE_RO_COMPAT_SUPP Fixes: d721a43ff69c ("bcache: increase super block version for cache device and backing device") Fixes: ffa470327572 ("bcache: add bucket_size_hi into struct cache_sb_disk for large bucket") Signed-off-by: Coly Li Cc: stable@vger.kernel.org # 5.9+ Signed-off-by: Jens Axboe --- drivers/md/bcache/features.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/bcache/features.h b/drivers/md/bcache/features.h index a1653c478041..32c5bbda2f0d 100644 --- a/drivers/md/bcache/features.h +++ b/drivers/md/bcache/features.h @@ -15,9 +15,9 @@ /* Incompat feature set */ #define BCH_FEATURE_INCOMPAT_LARGE_BUCKET 0x0001 /* 32bit bucket size */ -#define BCH_FEATURE_COMPAT_SUUP 0 -#define BCH_FEATURE_RO_COMPAT_SUUP 0 -#define BCH_FEATURE_INCOMPAT_SUUP BCH_FEATURE_INCOMPAT_LARGE_BUCKET +#define BCH_FEATURE_COMPAT_SUPP 0 +#define BCH_FEATURE_RO_COMPAT_SUPP 0 +#define BCH_FEATURE_INCOMPAT_SUPP BCH_FEATURE_INCOMPAT_LARGE_BUCKET #define BCH_HAS_COMPAT_FEATURE(sb, mask) \ ((sb)->feature_compat & (mask)) From 1dfc0686c29a9bbd3a446a29f9ccde3dec3bc75a Mon Sep 17 00:00:00 2001 From: Coly Li Date: Mon, 4 Jan 2021 15:41:20 +0800 Subject: [PATCH 538/547] bcache: check unsupported feature sets for bcache register This patch adds the check for features which is incompatible for current supported feature sets. Now if the bcache device created by bcache-tools has features that current kernel doesn't support, read_super() will fail with error messoage. E.g. if an unsupported incompatible feature detected, bcache register will fail with dmesg "bcache: register_bcache() error : Unsupported incompatible feature found". Fixes: d721a43ff69c ("bcache: increase super block version for cache device and backing device") Fixes: ffa470327572 ("bcache: add bucket_size_hi into struct cache_sb_disk for large bucket") Signed-off-by: Coly Li Cc: stable@vger.kernel.org # 5.9+ Signed-off-by: Jens Axboe --- drivers/md/bcache/features.h | 15 +++++++++++++++ drivers/md/bcache/super.c | 14 ++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/drivers/md/bcache/features.h b/drivers/md/bcache/features.h index 32c5bbda2f0d..e73724c2b49b 100644 --- a/drivers/md/bcache/features.h +++ b/drivers/md/bcache/features.h @@ -79,6 +79,21 @@ static inline void bch_clear_feature_##name(struct cache_sb *sb) \ BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LARGE_BUCKET); +static inline bool bch_has_unknown_compat_features(struct cache_sb *sb) +{ + return ((sb->feature_compat & ~BCH_FEATURE_COMPAT_SUPP) != 0); +} + +static inline bool bch_has_unknown_ro_compat_features(struct cache_sb *sb) +{ + return ((sb->feature_ro_compat & ~BCH_FEATURE_RO_COMPAT_SUPP) != 0); +} + +static inline bool bch_has_unknown_incompat_features(struct cache_sb *sb) +{ + return ((sb->feature_incompat & ~BCH_FEATURE_INCOMPAT_SUPP) != 0); +} + int bch_print_cache_set_feature_compat(struct cache_set *c, char *buf, int size); int bch_print_cache_set_feature_ro_compat(struct cache_set *c, char *buf, int size); int bch_print_cache_set_feature_incompat(struct cache_set *c, char *buf, int size); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 6aa23a6fb394..f4674a3298af 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -228,6 +228,20 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, sb->feature_compat = le64_to_cpu(s->feature_compat); sb->feature_incompat = le64_to_cpu(s->feature_incompat); sb->feature_ro_compat = le64_to_cpu(s->feature_ro_compat); + + /* Check incompatible features */ + err = "Unsupported compatible feature found"; + if (bch_has_unknown_compat_features(sb)) + goto err; + + err = "Unsupported read-only compatible feature found"; + if (bch_has_unknown_ro_compat_features(sb)) + goto err; + + err = "Unsupported incompatible feature found"; + if (bch_has_unknown_incompat_features(sb)) + goto err; + err = read_super_common(sb, bdev, s); if (err) goto err; From b16671e8f493e3df40b1fb0dff4078f391c5099a Mon Sep 17 00:00:00 2001 From: Coly Li Date: Mon, 4 Jan 2021 15:41:21 +0800 Subject: [PATCH 539/547] bcache: introduce BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE for large bucket When large bucket feature was added, BCH_FEATURE_INCOMPAT_LARGE_BUCKET was introduced into the incompat feature set. It used bucket_size_hi (which was added at the tail of struct cache_sb_disk) to extend current 16bit bucket size to 32bit with existing bucket_size in struct cache_sb_disk. This is not a good idea, there are two obvious problems, - Bucket size is always value power of 2, if store log2(bucket size) in existing bucket_size of struct cache_sb_disk, it is unnecessary to add bucket_size_hi. - Macro csum_set() assumes d[SB_JOURNAL_BUCKETS] is the last member in struct cache_sb_disk, bucket_size_hi was added after d[] which makes csum_set calculate an unexpected super block checksum. To fix the above problems, this patch introduces a new incompat feature bit BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE, when this bit is set, it means bucket_size in struct cache_sb_disk stores the order of power-of-2 bucket size value. When user specifies a bucket size larger than 32768 sectors, BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE will be set to incompat feature set, and bucket_size stores log2(bucket size) more than store the real bucket size value. The obsoleted BCH_FEATURE_INCOMPAT_LARGE_BUCKET won't be used anymore, it is renamed to BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET and still only recognized by kernel driver for legacy compatible purpose. The previous bucket_size_hi is renmaed to obso_bucket_size_hi in struct cache_sb_disk and not used in bcache-tools anymore. For cache device created with BCH_FEATURE_INCOMPAT_LARGE_BUCKET feature, bcache-tools and kernel driver still recognize the feature string and display it as "obso_large_bucket". With this change, the unnecessary extra space extend of bcache on-disk super block can be avoided, and csum_set() may generate expected check sum as well. Fixes: ffa470327572 ("bcache: add bucket_size_hi into struct cache_sb_disk for large bucket") Signed-off-by: Coly Li Cc: stable@vger.kernel.org # 5.9+ Signed-off-by: Jens Axboe --- drivers/md/bcache/features.c | 2 +- drivers/md/bcache/features.h | 11 ++++++++--- drivers/md/bcache/super.c | 22 +++++++++++++++++++--- include/uapi/linux/bcache.h | 2 +- 4 files changed, 29 insertions(+), 8 deletions(-) diff --git a/drivers/md/bcache/features.c b/drivers/md/bcache/features.c index 6469223f0b77..d636b7b2d070 100644 --- a/drivers/md/bcache/features.c +++ b/drivers/md/bcache/features.c @@ -17,7 +17,7 @@ struct feature { }; static struct feature feature_list[] = { - {BCH_FEATURE_INCOMPAT, BCH_FEATURE_INCOMPAT_LARGE_BUCKET, + {BCH_FEATURE_INCOMPAT, BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE, "large_bucket"}, {0, 0, 0 }, }; diff --git a/drivers/md/bcache/features.h b/drivers/md/bcache/features.h index e73724c2b49b..84fc2c0f0101 100644 --- a/drivers/md/bcache/features.h +++ b/drivers/md/bcache/features.h @@ -13,11 +13,15 @@ /* Feature set definition */ /* Incompat feature set */ -#define BCH_FEATURE_INCOMPAT_LARGE_BUCKET 0x0001 /* 32bit bucket size */ +/* 32bit bucket size, obsoleted */ +#define BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET 0x0001 +/* real bucket size is (1 << bucket_size) */ +#define BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE 0x0002 #define BCH_FEATURE_COMPAT_SUPP 0 #define BCH_FEATURE_RO_COMPAT_SUPP 0 -#define BCH_FEATURE_INCOMPAT_SUPP BCH_FEATURE_INCOMPAT_LARGE_BUCKET +#define BCH_FEATURE_INCOMPAT_SUPP (BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET| \ + BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE) #define BCH_HAS_COMPAT_FEATURE(sb, mask) \ ((sb)->feature_compat & (mask)) @@ -77,7 +81,8 @@ static inline void bch_clear_feature_##name(struct cache_sb *sb) \ ~BCH##_FEATURE_INCOMPAT_##flagname; \ } -BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LARGE_BUCKET); +BCH_FEATURE_INCOMPAT_FUNCS(obso_large_bucket, OBSO_LARGE_BUCKET); +BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LOG_LARGE_BUCKET_SIZE); static inline bool bch_has_unknown_compat_features(struct cache_sb *sb) { diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index f4674a3298af..3999641f1775 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -64,9 +64,25 @@ static unsigned int get_bucket_size(struct cache_sb *sb, struct cache_sb_disk *s { unsigned int bucket_size = le16_to_cpu(s->bucket_size); - if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES && - bch_has_feature_large_bucket(sb)) - bucket_size |= le16_to_cpu(s->bucket_size_hi) << 16; + if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) { + if (bch_has_feature_large_bucket(sb)) { + unsigned int max, order; + + max = sizeof(unsigned int) * BITS_PER_BYTE - 1; + order = le16_to_cpu(s->bucket_size); + /* + * bcache tool will make sure the overflow won't + * happen, an error message here is enough. + */ + if (order > max) + pr_err("Bucket size (1 << %u) overflows\n", + order); + bucket_size = 1 << order; + } else if (bch_has_feature_obso_large_bucket(sb)) { + bucket_size += + le16_to_cpu(s->obso_bucket_size_hi) << 16; + } + } return bucket_size; } diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h index 52e8bcb33981..cf7399f03b71 100644 --- a/include/uapi/linux/bcache.h +++ b/include/uapi/linux/bcache.h @@ -213,7 +213,7 @@ struct cache_sb_disk { __le16 keys; }; __le64 d[SB_JOURNAL_BUCKETS]; /* journal buckets */ - __le16 bucket_size_hi; + __le16 obso_bucket_size_hi; /* obsoleted */ }; /* From 5342fd4255021ef0c4ce7be52eea1c4ebda11c63 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Mon, 4 Jan 2021 15:41:22 +0800 Subject: [PATCH 540/547] bcache: set bcache device into read-only mode for BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET If BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET is set in incompat feature set, it means the cache device is created with obsoleted layout with obso_bucket_site_hi. Now bcache does not support this feature bit, a new BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE incompat feature bit is added for a better layout to support large bucket size. For the legacy compatibility purpose, if a cache device created with obsoleted BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET feature bit, all bcache devices attached to this cache set should be set to read-only. Then the dirty data can be written back to backing device before re-create the cache device with BCH_FEATURE_INCOMPAT_LOG_LARGE_BUCKET_SIZE feature bit by the latest bcache-tools. This patch checks BCH_FEATURE_INCOMPAT_OBSO_LARGE_BUCKET feature bit when running a cache set and attach a bcache device to the cache set. If this bit is set, - When run a cache set, print an error kernel message to indicate all following attached bcache device will be read-only. - When attach a bcache device, print an error kernel message to indicate the attached bcache device will be read-only, and ask users to update to latest bcache-tools. Such change is only for cache device whose bucket size >= 32MB, this is for the zoned SSD and almost nobody uses such large bucket size at this moment. If you don't explicit set a large bucket size for a zoned SSD, such change is totally transparent to your bcache device. Fixes: ffa470327572 ("bcache: add bucket_size_hi into struct cache_sb_disk for large bucket") Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 3999641f1775..2047a9cccdb5 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1332,6 +1332,12 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, bcache_device_link(&dc->disk, c, "bdev"); atomic_inc(&c->attached_dev_nr); + if (bch_has_feature_obso_large_bucket(&(c->cache->sb))) { + pr_err("The obsoleted large bucket layout is unsupported, set the bcache device into read-only\n"); + pr_err("Please update to the latest bcache-tools to create the cache device\n"); + set_disk_ro(dc->disk.disk, 1); + } + /* Allow the writeback thread to proceed */ up_write(&dc->writeback_lock); @@ -1554,6 +1560,12 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) bcache_device_link(d, c, "volume"); + if (bch_has_feature_obso_large_bucket(&c->cache->sb)) { + pr_err("The obsoleted large bucket layout is unsupported, set the bcache device into read-only\n"); + pr_err("Please update to the latest bcache-tools to create the cache device\n"); + set_disk_ro(d->disk, 1); + } + return 0; err: kobject_put(&d->kobj); @@ -2113,6 +2125,9 @@ static int run_cache_set(struct cache_set *c) c->cache->sb.last_mount = (u32)ktime_get_real_seconds(); bcache_write_super(c); + if (bch_has_feature_obso_large_bucket(&c->cache->sb)) + pr_err("Detect obsoleted large bucket layout, all attached bcache device will be read-only\n"); + list_for_each_entry_safe(dc, t, &uncached_devices, list) bch_cached_dev_attach(dc, c, NULL); From 55e6ac1e1f31c7f678d9f3c8d54c6f102e5f1550 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 8 Jan 2021 20:57:22 +0000 Subject: [PATCH 541/547] io_uring: io_rw_reissue lockdep annotations We expect io_rw_reissue() to take place only during submission with uring_lock held. Add a lockdep annotation to check that invariant. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index cb57e0360fcb..55ba1922a349 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2692,6 +2692,8 @@ static bool io_rw_reissue(struct io_kiocb *req, long res) if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker()) return false; + lockdep_assert_held(&req->ctx->uring_lock); + ret = io_sq_thread_acquire_mm_files(req->ctx, req); if (io_resubmit_prep(req, ret)) { From 4f793dc40bc605b97624fd36baf085b3c35e8bfd Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 8 Jan 2021 20:57:23 +0000 Subject: [PATCH 542/547] io_uring: inline io_uring_attempt_task_drop() A simple preparation change inlining io_uring_attempt_task_drop() into io_uring_flush(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 55ba1922a349..1c931e7a3948 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8964,23 +8964,6 @@ static void io_uring_del_task_file(struct file *file) fput(file); } -/* - * Drop task note for this file if we're the only ones that hold it after - * pending fput() - */ -static void io_uring_attempt_task_drop(struct file *file) -{ - if (!current->io_uring) - return; - /* - * fput() is pending, will be 2 if the only other ref is our potential - * task file note. If the task is exiting, drop regardless of count. - */ - if (fatal_signal_pending(current) || (current->flags & PF_EXITING) || - atomic_long_read(&file->f_count) == 2) - io_uring_del_task_file(file); -} - static void io_uring_remove_task_files(struct io_uring_task *tctx) { struct file *file; @@ -9072,7 +9055,17 @@ void __io_uring_task_cancel(void) static int io_uring_flush(struct file *file, void *data) { - io_uring_attempt_task_drop(file); + if (!current->io_uring) + return 0; + + /* + * fput() is pending, will be 2 if the only other ref is our potential + * task file note. If the task is exiting, drop regardless of count. + */ + if (fatal_signal_pending(current) || (current->flags & PF_EXITING) || + atomic_long_read(&file->f_count) == 2) + io_uring_del_task_file(file); + return 0; } From 6b5733eb638b7068ab7cb34e663b55a1d1892d85 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 8 Jan 2021 20:57:24 +0000 Subject: [PATCH 543/547] io_uring: add warn_once for io_uring_flush() files_cancel() should cancel all relevant requests and drop file notes, so we should never have file notes after that, including on-exit fput and flush. Add a WARN_ONCE to be sure. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 1c931e7a3948..f39671a0d84f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -9055,17 +9055,23 @@ void __io_uring_task_cancel(void) static int io_uring_flush(struct file *file, void *data) { - if (!current->io_uring) + struct io_uring_task *tctx = current->io_uring; + + if (!tctx) return 0; + /* we should have cancelled and erased it before PF_EXITING */ + WARN_ON_ONCE((current->flags & PF_EXITING) && + xa_load(&tctx->xa, (unsigned long)file)); + /* * fput() is pending, will be 2 if the only other ref is our potential * task file note. If the task is exiting, drop regardless of count. */ - if (fatal_signal_pending(current) || (current->flags & PF_EXITING) || - atomic_long_read(&file->f_count) == 2) - io_uring_del_task_file(file); + if (atomic_long_read(&file->f_count) != 2) + return 0; + io_uring_del_task_file(file); return 0; } From d9d05217cb6990b9a56e13b56e7a1b71e2551f6c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 8 Jan 2021 20:57:25 +0000 Subject: [PATCH 544/547] io_uring: stop SQPOLL submit on creator's death When the creator of SQPOLL io_uring dies (i.e. sqo_task), we don't want its internals like ->files and ->mm to be poked by the SQPOLL task, it have never been nice and recently got racy. That can happen when the owner undergoes destruction and SQPOLL tasks tries to submit new requests in parallel, and so calls io_sq_thread_acquire*(). That patch halts SQPOLL submissions when sqo_task dies by introducing sqo_dead flag. Once set, the SQPOLL task must not do any submission, which is synchronised by uring_lock as well as the new flag. The tricky part is to make sure that disabling always happens, that means either the ring is discovered by creator's do_exit() -> cancel, or if the final close() happens before it's done by the creator. The last is guaranteed by the fact that for SQPOLL the creator task and only it holds exactly one file note, so either it pins up to do_exit() or removed by the creator on the final put in flush. (see comments in uring_flush() around file->f_count == 2). One more place that can trigger io_sq_thread_acquire_*() is __io_req_task_submit(). Shoot off requests on sqo_dead there, even though actually we don't need to. That's because cancellation of sqo_task should wait for the request before going any further. note 1: io_disable_sqo_submit() does io_ring_set_wakeup_flag() so the caller would enter the ring to get an error, but it still doesn't guarantee that the flag won't be cleared. note 2: if final __userspace__ close happens not from the creator task, the file note will pin the ring until the task dies. Fixed: b1b6b5a30dce8 ("kernel/io_uring: cancel io_uring before task works") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 62 +++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 53 insertions(+), 9 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index f39671a0d84f..2f305c097bd5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -262,6 +262,7 @@ struct io_ring_ctx { unsigned int drain_next: 1; unsigned int eventfd_async: 1; unsigned int restricted: 1; + unsigned int sqo_dead: 1; /* * Ring buffer of indices into array of io_uring_sqe, which is @@ -2160,12 +2161,11 @@ static void io_req_task_cancel(struct callback_head *cb) static void __io_req_task_submit(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - bool fail; - fail = __io_sq_thread_acquire_mm(ctx) || - __io_sq_thread_acquire_files(ctx); mutex_lock(&ctx->uring_lock); - if (!fail) + if (!ctx->sqo_dead && + !__io_sq_thread_acquire_mm(ctx) && + !__io_sq_thread_acquire_files(ctx)) __io_queue_sqe(req, NULL); else __io_req_task_cancel(req, -EFAULT); @@ -6954,7 +6954,8 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) if (!list_empty(&ctx->iopoll_list)) io_do_iopoll(ctx, &nr_events, 0); - if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs))) + if (to_submit && !ctx->sqo_dead && + likely(!percpu_ref_is_dying(&ctx->refs))) ret = io_submit_sqes(ctx, to_submit); mutex_unlock(&ctx->uring_lock); } @@ -8712,6 +8713,10 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { mutex_lock(&ctx->uring_lock); percpu_ref_kill(&ctx->refs); + + if (WARN_ON_ONCE((ctx->flags & IORING_SETUP_SQPOLL) && !ctx->sqo_dead)) + ctx->sqo_dead = 1; + /* if force is set, the ring is going away. always drop after that */ ctx->cq_overflow_flushed = 1; if (ctx->rings) @@ -8874,6 +8879,18 @@ static void __io_uring_cancel_task_requests(struct io_ring_ctx *ctx, } } +static void io_disable_sqo_submit(struct io_ring_ctx *ctx) +{ + WARN_ON_ONCE(ctx->sqo_task != current); + + mutex_lock(&ctx->uring_lock); + ctx->sqo_dead = 1; + mutex_unlock(&ctx->uring_lock); + + /* make sure callers enter the ring to get error */ + io_ring_set_wakeup_flag(ctx); +} + /* * We need to iteratively cancel requests, in case a request has dependent * hard links. These persist even for failure of cancelations, hence keep @@ -8885,6 +8902,8 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, struct task_struct *task = current; if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { + /* for SQPOLL only sqo_task has task notes */ + io_disable_sqo_submit(ctx); task = ctx->sq_data->thread; atomic_inc(&task->io_uring->in_idle); io_sq_thread_park(ctx->sq_data); @@ -9056,6 +9075,7 @@ void __io_uring_task_cancel(void) static int io_uring_flush(struct file *file, void *data) { struct io_uring_task *tctx = current->io_uring; + struct io_ring_ctx *ctx = file->private_data; if (!tctx) return 0; @@ -9071,7 +9091,16 @@ static int io_uring_flush(struct file *file, void *data) if (atomic_long_read(&file->f_count) != 2) return 0; - io_uring_del_task_file(file); + if (ctx->flags & IORING_SETUP_SQPOLL) { + /* there is only one file note, which is owned by sqo_task */ + WARN_ON_ONCE((ctx->sqo_task == current) == + !xa_load(&tctx->xa, (unsigned long)file)); + + io_disable_sqo_submit(ctx); + } + + if (!(ctx->flags & IORING_SETUP_SQPOLL) || ctx->sqo_task == current) + io_uring_del_task_file(file); return 0; } @@ -9145,8 +9174,9 @@ static unsigned long io_uring_nommu_get_unmapped_area(struct file *file, #endif /* !CONFIG_MMU */ -static void io_sqpoll_wait_sq(struct io_ring_ctx *ctx) +static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) { + int ret = 0; DEFINE_WAIT(wait); do { @@ -9155,6 +9185,11 @@ static void io_sqpoll_wait_sq(struct io_ring_ctx *ctx) prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE); + if (unlikely(ctx->sqo_dead)) { + ret = -EOWNERDEAD; + goto out; + } + if (!io_sqring_full(ctx)) break; @@ -9162,6 +9197,8 @@ static void io_sqpoll_wait_sq(struct io_ring_ctx *ctx) } while (!signal_pending(current)); finish_wait(&ctx->sqo_sq_wait, &wait); +out: + return ret; } static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, @@ -9235,10 +9272,16 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (ctx->flags & IORING_SETUP_SQPOLL) { io_cqring_overflow_flush(ctx, false, NULL, NULL); + ret = -EOWNERDEAD; + if (unlikely(ctx->sqo_dead)) + goto out; if (flags & IORING_ENTER_SQ_WAKEUP) wake_up(&ctx->sq_data->wait); - if (flags & IORING_ENTER_SQ_WAIT) - io_sqpoll_wait_sq(ctx); + if (flags & IORING_ENTER_SQ_WAIT) { + ret = io_sqpoll_wait_sq(ctx); + if (ret) + goto out; + } submitted = to_submit; } else if (to_submit) { ret = io_uring_add_task_file(ctx, f.file); @@ -9665,6 +9708,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; err: + io_disable_sqo_submit(ctx); io_ring_ctx_wait_and_kill(ctx); return ret; } From 6bae85bd70d063b63fbe262d943cc321eab31b17 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 8 Jan 2021 22:46:02 -0800 Subject: [PATCH 545/547] maintainers: update my email address Change my email contact ahead of a likely painful eleven-month migration to a certain cobalt enteprisey groupware cloud product that will totally break my workflow. Some day I may get used to having to email being sequestered behind both claret and cerulean oath2+sms 2fa layers, but for now I'll stick with keying in one password to receive an email vs. the required four. Signed-off-by: Darrick J. Wong Signed-off-by: Linus Torvalds --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 3d1b8fe97261..292394098e39 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9273,7 +9273,7 @@ F: drivers/net/ethernet/sgi/ioc3-eth.c IOMAP FILESYSTEM LIBRARY M: Christoph Hellwig -M: Darrick J. Wong +M: Darrick J. Wong M: linux-xfs@vger.kernel.org M: linux-fsdevel@vger.kernel.org L: linux-xfs@vger.kernel.org @@ -19504,7 +19504,7 @@ F: arch/x86/xen/*swiotlb* F: drivers/xen/*swiotlb* XFS FILESYSTEM -M: Darrick J. Wong +M: Darrick J. Wong M: linux-xfs@vger.kernel.org L: linux-xfs@vger.kernel.org S: Supported From 5625dcfbbcf892e40e8d60abbb5f56701a1d031c Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 7 Jan 2021 17:12:08 +0530 Subject: [PATCH 546/547] Documentation: kbuild: Fix section reference Section 3.11 was incorrectly called 3.9, fix it. Signed-off-by: Viresh Kumar Signed-off-by: Masahiro Yamada --- Documentation/kbuild/makefiles.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/kbuild/makefiles.rst b/Documentation/kbuild/makefiles.rst index d36768cf1250..9f6a11881951 100644 --- a/Documentation/kbuild/makefiles.rst +++ b/Documentation/kbuild/makefiles.rst @@ -598,7 +598,7 @@ more details, with real examples. explicitly added to $(targets). Assignments to $(targets) are without $(obj)/ prefix. if_changed may be - used in conjunction with custom rules as defined in "3.9 Custom Rules". + used in conjunction with custom rules as defined in "3.11 Custom Rules". Note: It is a typical mistake to forget the FORCE prerequisite. Another common pitfall is that whitespace is sometimes significant; for From 7c53f6b671f4aba70ff15e1b05148b10d58c2837 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 10 Jan 2021 14:34:50 -0800 Subject: [PATCH 547/547] Linux 5.11-rc3 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 8b2c3f88ee5e..9e73f82e0d86 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 11 SUBLEVEL = 0 -EXTRAVERSION = -rc2 +EXTRAVERSION = -rc3 NAME = Kleptomaniac Octopus # *DOCUMENTATION*